*	char *memcpy(dest, source, len)
*		register char *dest, *source;
*		register unsigned int len;
*
*	Entry point for the variant whose length argument is read as a
*	16-bit word.  The word at 12(a7) is zero-extended into d1.l and
*	control passes to the shared copy body at label "memcpy", which
*	reads dest from 4(a7) and source from 8(a7) and returns dest in d0.
*
*	The shared body begins with a beq that relies on the condition
*	codes left by the move.w below; the bra between them does not
*	alter them, so no instruction that changes the flags may be
*	inserted after the move.w.
.text
.globl _memcpy
_memcpy:
	moveq	#0,d1		; clear all of d1 so the upper word is zero
	move.w	12(a7),d1	; d1.l <- #bytes (sets Z when len is 0)
	bra	memcpy		; join the shared routine with flags intact

*	char *lmemcpy(dest, source, len)
*		register char *dest, *source;
*		register unsigned long len;
*
*	Copy len bytes from source to dest and return dest in d0.
*	Arguments are taken from the stack: dest at 4(a7), source at
*	8(a7), len (a long) at 12(a7).
*
*	The label "memcpy" is the shared copy body.  _memcpy branches to
*	it with d1.l = byte count and the condition codes reflecting that
*	count, so the first instruction there must remain the beq.
*
*	Method:
*	  - If source and dest agree in address bit 0, one leading byte
*	    is copied when both are odd, then the data is moved as longs,
*	    followed by at most one trailing word and one trailing byte.
*	  - Otherwise everything is moved one byte at a time.
*	  dbra only counts a 16-bit word, so each count is split: the low
*	  word drives a simple loop, and every unit of the high word
*	  (65536 items) is done by a 4x unrolled loop of 16384 passes.
*
*	All copies run upward through memory ((a0)+,(a1)+); overlapping
*	regions get no special treatment.
*	Modifies d0-d2/a0-a1 without restoring them.
*	NOTE(review): d2 is used as scratch -- confirm the calling
*	convention of the target compiler allows that.
.text
.globl _lmemcpy
_lmemcpy:
	move.l	12(a7),d1	; d1.l <- #bytes fall thro' to main routine
memcpy:
	beq	lmemcpy8	; nothing to copy? just return dest
	move.l	4(a7),a1	; a1 <- destination
	move.l	8(a7),a0	; a0 <- source

	move.l	a0,d0		; check if aligned together
	move.l	a1,d2
	eor.w	d0,d2		; bit 0 of d2 set = addresses differ in parity
	btst	#0,d2		; both odd or both even?
	bne	lmemcpy9	; NO, must copy byte by byte
	btst	#0,d0		; YES, do they have odd alignment?
	beq	lmemcpy0
	move.b	(a0)+,(a1)+	; YES, copy first byte to reach even addresses
	subq.l	#1,d1		; and reduce count
lmemcpy0:
	move.l	d1,d2		; save full count value (bits 0-1 used for tail)
	lsr.l	#2,d1		; convert to long count
	move.w	d1,d0		; d0.w <- low word: # of extra longs to copy
	swap	d1		; d1.w <- high word: # of 64K-long blocks to copy
	bra	lmemcpy2	; enter at the dbra so d0 = 0 copies nothing
lmemcpy1:
	move.l	(a0)+,(a1)+	; extra longs copy loop
lmemcpy2:
	dbra	d0,lmemcpy1
	bra	lmemcpy6
lmemcpy3:
	move.w	#$4000,d0	; 64K longs = 16K passes * 4 longs per pass
	bra	lmemcpy5
lmemcpy4:
	move.l	(a0)+,(a1)+	; 64K-long block copy loop
	move.l	(a0)+,(a1)+	; 4 longs at a time
	move.l	(a0)+,(a1)+	; or, 8 words at a time
	move.l	(a0)+,(a1)+	; or, 16 bytes at a time
lmemcpy5:
	dbra	d0,lmemcpy4
lmemcpy6:
	dbra	d1,lmemcpy3	; one more 64K-long block to do?
	btst	#1,d2		; extra word to copy? (Z set when bit is clear)
	beq	lmemcpy7	; NO, skip it
	move.w	(a0)+,(a1)+	; YES, copy it
lmemcpy7:
	btst	#0,d2		; extra odd byte to copy?
	beq	lmemcpy8
	move.b	(a0)+,(a1)+	; copy last byte
lmemcpy8:
	move.l	4(a7),d0	; return destination pointer
	rts
lmemcpy9:
	move.w	d1,d0		; NON-ALIGNED, d0.w <- extra bytes to copy
	swap	d1		; 	       d1.w <- extra 64kbytes to copy
	bra	lmemcpy11	; enter at the dbra so d0 = 0 copies nothing
lmemcpy10:
	move.b	(a0)+,(a1)+	; extra bytes copy loop
lmemcpy11:
	dbra	d0,lmemcpy10
	bra	lmemcpy15
lmemcpy12:
	move.w	#$4000,d0	; 64kbytes = 16k passes * 4 bytes per pass
	bra	lmemcpy14
lmemcpy13:
	move.b	(a0)+,(a1)+	; 64kbyte copy loop
	move.b	(a0)+,(a1)+
	move.b	(a0)+,(a1)+
	move.b	(a0)+,(a1)+
lmemcpy14:
	dbra	d0,lmemcpy13
lmemcpy15:
	dbra	d1,lmemcpy12	; one more 64kbyte block to do?
	bra	lmemcpy8	; jump to end code
