***************************************************************** :ts=8 *****
*
*	SCROLL.S
*
*	(C) Copyright Eddy Carroll, January 1991.
*
*	Replaces BltBitMap with a routine that uses the CPU (preferably
*	68030). This increases speed by a factor of about 2.8 on the A3000
*	when the cache is enabled or 2.0 when the cache is disabled.
*
*****************************************************************************

	include "exec/types.i"
	include "exec/execbase.i"
	include "exec/nodes.i"
	include "graphics/gfx.i"

	XDEF	_NewBltBitMap
	XDEF	_BltBitMapAddress
	XDEF	_OnlySingle
	XDEF	_UsageCount
	XDEF	_Broken
	XDEF	_BlitFunc
	XDEF	_MinTaskPri
	XDEF	_StartBlit
	XDEF	_ExitBlit
	XDEF	_ShareBlit
	XDEF	_Friend1
	XDEF	_Friend2

	XREF	_SysBase
	XREF	_GfxBase
	XREF	_LVOWait
	XREF	_LVOWaitBlit

	SECTION Scroll,CODE

*****************************************************************************
*
*	NewBltBitMap()
*
*	Replacement BltBitMap which uses the 68030 instead of the blitter.
*	The following conditions must hold for the CPU routine to be used:
*
*	    o	Bitmaps aligned on same longword bit offset
*		(i.e. XSrc % 32 == XDest % 32)
*
*	    o	If source bitmap == destination bitmap, then YSrc != YDest
*
*	    o	Blitter minterm = $Cx (i.e. straight copy)
*
*	If any of these conditions doesn't hold, then the original BltBitMap
*	is called instead.
*
*	Input:
*		D0 - X Source
*		D1 - Y Source
*		D2 - X Dest
*		D3 - Y Dest
*		D4 - X Size
*		D5 - Y Size
*		D6 - Minterm
*		D7 - Mask, indicating which planes are to be affected
*		A0 - Pointer to source bitmap structure
*		A1 - Pointer to destination bitmap structure
*		A2 - Pointer to temporary bitmap structure (not used)
*
*	Output:
*		D0 - Number of planes actually copied
*
*	The copy routine works as follows. Everything is done in longword
*	units. If the bitmap being copied fits horizontally into a single
*	longword, then the CopySingle() routine is used which copies a
*	single column of longwords, masked out as appropriate. Otherwise,
*	there are at least two longwords involved (the left and right edges
*	of the bitmap), with possibly some longwords in between as well.
*	CopyMultiple() is called to perform this copy; it uses two mask
*	values to identify which bits in the left and right longwords should
*	be copied. The longwords (if any) in between are copied verbatim.
*
*	Note that using longwords gives a big win on the A3000 since it can
*	access CHIP ram via the 32 bit bus. This relies on the data being
*	longword aligned of course. In the worst case (where a bitmap width
*	is not a multiple of 4), one out of every two rows will be longword
*	aligned, which is not too bad. In the more common case, every row
*	is longword aligned. For overscan users, it's best to have your
*	screen width a multiple of 32.
*
*****************************************************************************

;
;	Exit path back to the original BltBitMap. The labels below form a
;	fall-through chain; each entry point undoes one more piece of the
;	state set up by _NewBltBitMap before jumping to the system routine.
;
;	    PreExit   - entered from _Friend2 with D0.W pushed on the stack
;			and A0 already restored
;	    _ExitBlit - entered with the caller's A0 still parked in D6
;	    DoOldBlt  - entered with A0 correct, _UsageCount incremented
;	    oldblt2   - entered with the caller's D6 pushed on the stack
;	    oldblit   - entered with all registers as BltBitMap received them
;
PreExit:
	move.w	(sp)+,d0		; Restore original D0 (word pushed by _Friend2)
	exg	d6,a0			; A0 is already correct here, so swap once
					; to cancel the exg that we fall into
_ExitBlit:
	exg	d6,a0			; Restore original A0 register
DoOldBlt:
	subq.l	#1,_UsageCount		; Decrement number of callers in code
oldblt2:
	move.l	(sp)+,d6		; Restore original D6 (minterm) saved at nb2
oldblit:
	jmp	dummy			; Filled in with correct address later

;
;	Address of the 32-bit target operand of the JMP above (the opcode
;	word occupies the first two bytes). Exported so that the real
;	BltBitMap address can be stored there; presumably done by the
;	installer outside this section.
;
_BltBitMapAddress equ oldblit+2

;
;	Placeholder JMP target; also used by _NewBltBitMap as the immediate
;	return for zero-width blits.
;
dummy:	rts

;
;	_NewBltBitMap -- entry point that stands in for BltBitMap.
;
;	Screens the request and either jumps to the original routine
;	(oldblit/oldblt2) or dispatches through _BlitFunc to one of the
;	policy routines (_StartBlit, _ExitBlit, _ShareBlit, _Friend1,
;	_Friend2), which make the final CPU-versus-blitter decision.
;
;	On dispatch through _BlitFunc:
;	    D6       - caller's A0 (the caller's D6 is on the stack)
;	    A0       - address of the policy routine (scratch)
;	    (SP)     - caller's D6
;	    _UsageCount has been incremented
;
;	A zero width returns immediately. A zero or negative height is
;	passed straight to the original routine: the CPU copy loops count
;	rows with DBF after converting the row count to zero-based, so a
;	row count below one would wrap and copy tens of thousands of rows.
;
_NewBltBitMap:
	tst.w	d4			; Check if width is zero
	beq.s	dummy			; If it is, don't do anything
	tst.w	d5			; Check if height is zero or negative
	ble.s	oldblit			; If so, CPU row loops can't cope with
					; it; leave it to the system routine
	cmp.l	a0,a1			; Copying within the same bitmap?
	bne.s	nb1			; If yes,
	cmp.w	d1,d3			; and Y row is the same, then it's a
	beq.s	oldblit			; sideways blit so use system routine
	bra.s	nb2			; Else skip to next check
nb1:
	tst.l	_OnlySingle		; Should we handle different src/dest
	bne.s	oldblit			; If not, use standard system blit
nb2:
	move.l	d6,-(sp)		; Save current minterm register
	and.b	#$f0,d6			; Mask out low bits
	cmp.b	#$c0,d6			; Is it standard COPY minterm?
	bne.s	oldblt2			; If not, exit
	move.l	d0,d6			; See if XSrc % 32 == XDest % 32
	eor.l	d2,d6			; Low 5 bits should be zero if equal
	and.b	#$1f,d6			;
	bne.s	oldblt2			; If not, then have to do normal blit
	tst.l	_Broken			; Are we accommodating broken s/w?
	bne.s	nb3			; If so, skip checks
	tst.b	bm_Flags(a0)		; Is source standard Amiga bitmap?
	bne.s	oldblt2			; If not, use system blit routine
	tst.w	bm_Pad(a0)		; Pad word must be clear as well
	bne.s	oldblt2			;
	tst.b	bm_Flags(a1)		; How about destination?
	bne.s	oldblt2			; If it isn't, use system blit
	tst.w	bm_Pad(a1)		; Same pad check for destination
	bne.s	oldblt2			;
nb3:
	addq.l	#1,_UsageCount		; Increment usage count
	exg	d6,a0			; Save current A0
	move.l	_BlitFunc,a0		; Get pointer to appropriate test func
	jmp	(a0)			; And branch to it
;
;	Checks the usage count for the blitter code, to see if anyone else
;	is currently executing it. If so, use the blitter instead (hence
;	CPU does one blit while blitter does the other blit; multiprocessing!)
;
;
;	Entered via _BlitFunc with the caller's A0 in D6 and the caller's
;	D6 on the stack. D6 is used as scratch for the counter test; its
;	saved copy is popped later on either path.
;
;	NOTE(review): _UsageCount has already been incremented at nb3 by
;	the time this test runs, so "nobody else in the code" only reads
;	as zero here if the counter's idle value is -1 -- confirm against
;	the definition of _UsageCount.
;
_ShareBlit:
	exg	d6,a0			; Restore old A0
	move.l	_UsageCount,d6		; Check if someone already in code
	bne	DoOldBlt		; If there is, use blitter instead
	bra.s	sblit2			; Else skip to use CPU
;
;	Checks to see if there is more than one task ready to run. If so,
;	use the blitter, else use the CPU. Note that for the most common case
;	of scrolling (in a CLI/console window), the task outputting the text
;	that causes the scroll will be "Ready to Run" since it is pre-empted
;	by the console device before it has a chance to go into a Wait
;	condition.
;
;	If there is more than one task ready to run, but the second task
;	in the queue has priority < MinTaskPri, then we can use the CPU
;	anyway (since the second task is a background task that can be
;	ignored).
;
;
;	Entered via _BlitFunc with the caller's A0 in D6 and the caller's
;	D6 on the stack. A0 is used to walk the TaskReady list; D0.W is
;	borrowed (and saved on the stack) to hold the second task's
;	priority. Exits to _StartBlit/sblit2 for a CPU blit or to PreExit
;	for a system blit.
;
;	NOTE(review): the list is walked without any visible locking in
;	this routine -- confirm that this is acceptable for the caller.
;
_Friend2:
	move.l	_SysBase,a0		; Get pointer to ExecBase
	lea.l	TaskReady(a0),a0	; Get ptr to TaskReady list
	cmp.l	8(a0),a0		; Empty list? (tail pred == header)
	beq.s	_StartBlit		; If yes, do blit
	move.w	d0,-(sp)		; Grab a register temporarily
	move.l	(a0),a0			; Get pointer to first waiting task
	move.l	(a0),a0			; Get pointer to second task
	move.b	LN_PRI(a0),d0		; Get its priority (if it exists)
	move.l	(a0),a0			; And final link ptr (NULL if at end)
	exg	d6,a0			; Restore previous A0; D6 = link ptr
	tst.l	d6			; More than 1 task?
	beq.s	F2Okay			; If no, we can use the CPU anyway
	cmp.b	_MinTaskPri,d0		; Should we make way for waiting task?
	bge	PreExit			; If so, use blitter instead
F2Okay:
	move.w	(sp)+,d0		; Else restore D0
	bra.s	sblit2			; And skip to start blit
;
;	Checks to see if there are _any_ other tasks ready to run. If there
;	are and their task priority is >= MinTaskPri, then uses system blit
;	instead of CPU.
;
;
;	Entered via _BlitFunc with the caller's A0 in D6 and the caller's
;	D6 on the stack. Looks only at the first node of the TaskReady
;	list: if the list is empty, or that node's priority is below
;	_MinTaskPri, the CPU is used; otherwise the call is handed to the
;	system routine.
;
_Friend1:
	movea.l	_SysBase,a0		; A0 = ExecBase
	lea	TaskReady(a0),a0	; A0 = header of the TaskReady list
	cmpa.l	8(a0),a0		; Tail predecessor == header => empty
	beq.s	_StartBlit		; Nothing is waiting: go and blit
	movea.l	(a0),a0			; A0 = first node on the list
	movea.w	LN_TYPE(a0),a0		; One word fetch: ln_Type in the high
					; byte, ln_Pri in the low byte
	exg	d6,a0			; A0 = caller's value, D6.B = ln_Pri
	cmp.b	_MinTaskPri,d6		; Is the task important enough?
	bge	DoOldBlt		; Yes: step aside, let the blitter work
	bra.s	sblit2			; No: carry on with the CPU copy

;----------------------------------------------------------------------------
;	Where the action starts. Initialises everything and then performs
;	the blits using the CPU. At this stage, all registers are exactly
;	as they were on entry to BltBitMap, except for D6 and A0, and these
;	two are restored to the correct values immediately on entry.
;----------------------------------------------------------------------------
;
;	_StartBlit is entered with the caller's A0 parked in D6; sblit2 is
;	entered with A0 already restored. In both cases the caller's D6 is
;	on top of the stack and _UsageCount has been incremented.
;
_StartBlit:
	exg	d6,a0			; Restore A0
sblit2:					; Alternative entry point
;
;	Now we need to determine the masks to be used for clipping, along
;	with the start address in memory of the section of the bit and
;	the modulo of each bitplane (the amount added onto the end of each
;	copied row address to get to the start of the next one). Then loop
;	over all the selected bitplanes, copying those requested.
;
;	The register save below leaves the frame laid out, from (SP)
;	upwards, as D1,D2,D3,D4,D5,D7,A0-A6; the plane mask (D7) is read
;	back from 20(SP) later on.
;
	movem.l	d1-d5/d7/a0-a6,-(sp)	; Save rest of the registers
;
;	Next, we need to make sure that the blitter is free. This is because
;	some other blitter operation that operates on the bitmaps we've been
;	passed may have started but not yet finished. Operations that
;	depend on the blitter are guaranteed to occur in the right order
;	(since the blitter can't multitask with itself) but when we start
;	doing some of them with the CPU, we need to be a bit more careful.
;
;	Note: Since we are now "in" graphics.library, a6 holds GfxBase.
;	      WaitBlit() is documented as preserving all registers.
;
	jsr	_LVOWaitBlit(a6)	; Wait for blitter to become free
	ext.l	d0			; Convert all parameters to long
	ext.l	d1			;
	ext.l	d2			;
	ext.l	d3			;
	ext.l	d4			;
	ext.l	d5			;

;
;	NOTE(review): this is a word-sized compare followed by an unsigned
;	branch, so it matches a signed "YDest > YSrc" test only while both
;	Y values are non-negative -- confirm callers never pass negative Y.
;
	cmp	d1,d3			; See if we are scrolling up or down
	bhi	bltdown			;
;
;	Since YDest <= YSrc, we are copying the bitmap upwards in memory
;	therefore start at the beginning and work down. (This is only
;	important if the source and destination bitmaps are the same, but
;	it doesn't do any harm to check when they are different also.)
;
;	On exit from this section:
;	    A2/A3 - BytesPerRow of source/dest (row modulo before adjustment)
;	    A4/A5 - byte offset of first longword within source/dest plane
;
bltup:
	move.w	bm_BytesPerRow(a0),d6	; Get width of source bitmap
	ext.l	d6			; Extend to full integer
	move.l	d6,a2			; Initialise modulo for source bitmap
	muls	d6,d1			; Calculate row offset
	move.l	d0,d6			; Get XSrc
	lsr.l	#3,d6			; Get #bytes offset of XSrc
	and.b	#$fc,d6			; Adjust to longword boundary
	add.l	d6,d1			; Add on x offset to get bitmap offset
	move.l	d1,a4			; Now A4 = offset into source bitmap
;
;	Repeat for dest bitmap
;
	move.w	bm_BytesPerRow(a1),d6	; Get width of dest bitmap
	ext.l	d6			; Extend to full integer
	move.l	d6,a3			; Initialise modulo for dest bitmap
	muls	d6,d3			; Calculate row offset
	move.l	d2,d6			; Get XDest
	lsr.l	#3,d6			; (Converted to longword aligned
	and.b	#$fc,d6			; byteoffset)
	add.l	d6,d3			; Add on xoffset to get bitmap offset
	move.l	d3,a5			; Now A5 = offset into dest bitmap
	bra.s	contblit		; Skip to rest of blitcopy
;
;	If we get here, YDest > YSrc, so we are copying the bitmap downwards
;	which means we need to start from the end and work back. We also
;	need to initialise the modulo to -BytesPerRow instead of BytesPerRow.
;
;
;	Downward copy set-up. On exit:
;	    A2/A3 - minus BytesPerRow of source/dest (rows are walked backwards)
;	    A4/A5 - byte offset of the first longword of the LAST row to be
;		    copied within the source/dest plane
;
bltdown:
	add.l	d5,d1			; Add YSize+YSrc to get last row addr
	subq.l	#1,d1			; Adjust (so we don't have last_row+1)
	move.w	bm_BytesPerRow(a0),d6	; Get width of source bitmap
	ext.l	d6			; Extend to full longword
	muls	d6,d1			; Calculate row offset
	neg.l	d6			; Negate mod. since copying backwards
	move.l	d6,a2			; Initialise modulo for source bitmap
	move.l	d0,d6			; Get XSrc
	lsr.l	#3,d6			; Get #bytes offset of XSrc
	and.b	#$fc,d6			; Adjust to longword boundary
	add.l	d6,d1			; Add on x offset to get bitmap offset
	move.l	d1,a4			; Now A4 = offset into source bitmap
;
;	Do same calculations for dest bitmap
;
	add.l	d5,d3			; Add YSize+YDest to get last row addr
	subq.l	#1,d3			; Adjust (so we don't have last_row+1)
	move.w	bm_BytesPerRow(a1),d6	; Get width of dest bitmap
	ext.l	d6			; Extend to full longword
	muls	d6,d3			; Calculate row offset
	neg.l	d6			; Negate, since copying backwards
	move.l	d6,a3			; Initialise modulo for dest bitmap
	move.l	d2,d6			; Get XDest
	lsr.l	#3,d6			; (Converted to longword aligned
	and.b	#$fc,d6			; byteoffset)
	add.l	d6,d3			; Add on xoffset to get bitmap offset
	move.l	d3,a5			; Now A5 = offset into dest bitmap
;
;	Now calculate the mask values
;
;
;	Common continuation for upward and downward copies. Works out the
;	edge masks, the longword span of a row, the number of planes to
;	visit and the adjusted row modulos.
;
;	On exit from this section:
;	    D0    - number of rows (YSize)
;	    D1    - left edge mask, D2 - right edge mask
;	    D3    - number of longwords between the two edge longwords
;		    (-1 when the whole row lies inside a single longword)
;	    D4/D5 - pointers to the source/dest bm_Planes arrays
;	    D6    - number of planes to visit, minus one
;	    D7    - plane mask as passed by the caller
;	    A2/A3 - byte offsets into source/dest planes
;	    A4/A5 - row modulos for source/dest
;
contblit:
	and.w	#$1f,d0			; Calculate XSrc longword bit offset
	add.l	d0,d4			; Offset + XSize = bit position just
					; past the right edge
	move.l	d4,d1			; Calculate longword bit offset
	and.w	#$1f,d1			; of that position
	lsr.l	#5,d4			; Calc # of longwords needed for copy
	add.l	d1,d1			; Scale XWidth bits to longword index
	add.l	d1,d1			; into the bitmask array
	bne.s	contb1			; If zero, the row ends exactly on a
	subq.l	#1,d4			; longword boundary, so the last
					; longword touched is one further back
contb1:
	lea	RightMask(PC),a6	; Get address of right mask table
	move.l	0(a6,d1.w),d2		; Get right bitmask
	add.l	d0,d0			; Scale XSrc bits to longword index
	add.l	d0,d0			; And again
contb2:
	lea	LeftMask(PC),a6		; Get address of left mask table
	move.l	0(a6,d0.w),d1		; Get left bitmask
;
;	Calculate minimum number of bitplanes to copy
;
	moveq.l	#0,d6			; Zero out high bits of D6
	move.b	bm_Depth(a0),d6		; Get depth of source bitmap
	cmp.b	bm_Depth(a1),d6		; If greater than that of dest bitmap
	blo.s	contb3			; (source is shallower: keep it)
	move.b	bm_Depth(a1),d6		; Use dest bitmap depth instead
contb3:
	subq.l	#1,d6			; Adjust depth to 0-based, not 1-based
	move.l	d4,d0			; Copy longword count
	addq.l	#1,d0			; Adjust positively
	add.l	d0,d0			; Convert longword count to byte count
	add.l	d0,d0			;
	sub.l	d0,a2			; Calculate correct modulo for source
	sub.l	d0,a3			; Calculate correct modulo for dest.
	exg	a2,a4			; Setup A2/A3 = bitmap offsets
	exg	a3,a5			;  and  A4/A5 = bitmap modulos

	subq.l	#1,d4			; Adjust longword count to zero-based
	move.l	d4,d3			; Move to right reg for Copy routine
	move.l	d5,d0			; Copy YSize to right place also
	lea.l	bm_Planes(a0),a0	; Get pointer to source bitplane array
	lea.l	bm_Planes(a1),a1	; Get pointer to dest bitplane array
	move.l	a0,d4			; Stash bitplane pointers here
	move.l	a1,d5			;
	move.l	20(sp),d7		; Read plane mask value from stack
					; (the D7 slot of the saved registers)
;
;	Now build a list of bitplanes to be copied on the stack. To this end,
;	we reserve 8 * 8 = 64 bytes of stack for source/destination bitplane
;	pointers (room for eight source/dest longword pairs).
;
	lea	-64(sp),sp		; Reserve space for bitmap ptrs
	move.l	sp,a6			; And point to it using A6
;
;	Loop through the planes, building the list for those selected in
;	the copy mask. Bitplanes which have source and/or destination
;	pointers set to NULL or -1 get handled immediately (new for WB 2.0).
;	All others get stored on the stack.
;
;	Inside the loop D4/D5 walk the source/dest bm_Planes arrays. While
;	a plane is being examined they temporarily hold the plane pointers
;	themselves and A0/A1 hold the array positions; the exg pairs put
;	things back before the array positions are bumped at cmultx.
;
	move.w	d7,-(sp)		; Save plane mask as temporary value
	moveq.l	#0,d7			; Clear bitmap plane count
cmultlp:
	lsr.w	(sp)			; See if need to copy this bitplane
	bcc.s	cmultx			; If not, skip over code
	addq	#1,d7			; Increment number of bitmaps copied
	move.l	d4,a0			; Get pointer to source bitplane ptr
	move.l	d5,a1			; And destination bitplane ptr
	move.l	(a0),d4			; Read pointers to bitplanes
	move.l	(a1),d5			;
	not.l	d5			; Check if dest is -1
	beq	skipfill		; If so, don't copy anything
	not.l	d5			; Check if dest is zero
	beq	skipfill		; If so, don't copy anything
	not.l	d4			; Check if source is -1
	beq	fillones		; If so, fill dest with 1's
	not.l	d4			; Check if source is 0
	beq	fillzeros		; If so, fill dest with 0's
	exg	d4,a0			; Put registers back in right place
	exg	d5,a1			;
	add.l	a2,a0			; Add in correct offset for src ptr
	add.l	a3,a1			; Add in correct offset for dest ptr
	move.l	a0,(a6)+		; Store bitmap pointers on the stack
	move.l	a1,(a6)+		; 	
cmultx:
	addq.l	#4,d4			; Bump bitplane pointers
	addq.l	#4,d5			;
	dbf	d6,cmultlp		; Repeat for remaining bitplanes
	addq.l	#2,sp			; Pop plane mask from stack
;
;	Now copy all the bitplanes we accumulated on the stack. There will be
;	between 0 and 8 of them. We copy them in groups of 1 to 4, so two
;	operations may be required.
;
;	A quick recap on what the various registers contain:
;
;	    D0 - Number of rows to copy
;	    D1 - Mask for left edge of bitmap
;	    D2 - Mask for right edge of bitmap
;	    D3 - Number of longwords _between_ left edge and right edge
;		 (negative if the row fits in a single longword)
;	    D7 - Total number of bitplanes copied (including 0 & -1 ptrs)
;	    A4 - Modulo of source bitplanes
;	    A5 - Modulo of dest bitplanes
;	    A6 - Points to end of source/dest bitplane pointers
;	    SP - Points to start of source/dest bitplane pointers
;
;	Each stored pair occupies 8 bytes, so halving the byte count and
;	subtracting 4 gives 4 * (pairs - 1), which is used directly as a
;	byte offset into FuncTab.
;
	sub.l	sp,a6			; Calculate how many bitplanes to copy
	move.l	a6,d6			; (8 bytes per source/dest pair)
	lsr.l	#1,d6			; Halve it: 4 bytes per pair
	subq	#4,d6			; Adjust to zero based (range 0-28)
	bpl.s	cmultx2			; If negative, no bitplanes to copy
	lea	64(sp),sp		; so pop bitplane pointers from stack
	bra	doneblt			; and exit without doing any work
cmultx2:
	cmpi.w	#12,d6			; More than 4 bitplanes to copy?
	bhi.s	cmult_db		; If so, skip to do in two goes
	move.l	d3,d3			; Does bitmap fits in one longword?
	bpl.s	cmult_mm		; If not, skip to multiple longwords
;
;	We have between 1 and 4 bitplanes to copy, each a single
;	longword wide.
;
	and.l	d2,d1			; Create composite mask
	addq	#8,d6			; Adjust to index CopySingle() entries
	addq	#8,d6			; and then fall through.
;
;	We have between 1 and 4 bitplanes to copy, each at least two
;	longwords wide.
;
cmult_mm:
	move.l	FuncTab(pc,d6),a6	; Else call appropriate routine
	jsr	(a6)			;
	lea	64(sp),sp		; Pop everything off the stack
	bra	doneblt			; And skip to end of blit

cmult_db:
	move.l	d3,d3			; Does bitplane fit in one longword?
	bpl.s	cmult_dbm		; If not, skip to multiple copy
;
;	We have between 5 and 8 bitplanes to copy, each just one
;	longword wide. Note that when we exit, we branch into the code to
;	copy the remaining bitmaps, but with the function index pointing
;	into the CopySingle() entries rather than CopyMultiple(). D6 is
;	16-28 here, which already selects Copy1Single..Copy4Single, so it
;	is deliberately left unadjusted on this path.
;
	and.l	d2,d1			; Create composite mask
	bsr	Copy4Single		; Copy first four bitplanes
	bra.s	cmult_dbm2		; Skip to exit with correct fn index
;
;	We have between 5 and 8 bitplanes to copy, each at least two
;	longwords wide.
;
cmult_dbm:
	bsr	Copy4Multiple		; Copy first four bitmaps in one gulp
	subi.w	#16,d6			; Adjust bitmap count
cmult_dbm2:
	lea	32(sp),sp		; Pop first four bitmaps off stack
	move.l	FuncTab(pc,d6),a6	; Copy remaining bitmaps
	jsr	(a6)			;
	lea	32(sp),sp		; Pop remaining bitmaps
	bra	doneblt			; And skip to end of blit

;
;	Index to table of functions for copying from 1 to 4 multiple and
;	single longword bitmaps.
;
;
;	Indexed by byte offset 4 * (planes - 1): entries 0-3 are the
;	multiple-longword routines, entries 4-7 the single-longword ones.
;	The dispatch code depends on this exact ordering, and addresses
;	the table PC-relative, so it must stay close to that code.
;
FuncTab:
	dc.l	Copy1Multiple,Copy2Multiple,Copy3Multiple,Copy4Multiple
	dc.l	Copy1Single,Copy2Single,Copy3Single,Copy4Single

;
;	Skip past current bitplane without doing anything to bitplane data
;	(used when destination bitmap ptr is 0 or -1).
;
;
;	The three handlers below are branched to from the plane loop with
;	D4/D5 holding the (possibly inverted) plane pointers and A0/A1
;	holding the positions in the bm_Planes arrays. Each swaps them back
;	so that D4/D5 are the array positions again before rejoining the
;	loop at cmultx.
;
skipfill:
	exg	d4,a0			; Restore original pointers
	exg	d5,a1			;
	bra	cmultx			; Skip back to do next bitplane
;
;	Fill bitplane with one's (source bitplane pointer is -1)
;
;	Fill_1s is defined outside this section; presumably it takes the
;	destination start in A1 plus the row count, masks, longword count
;	and modulo already held in registers -- TODO confirm.
;
fillones:
	exg	d4,a0			; Restore register order
	exg	d5,a1			;
	add.l	a3,a1			; Add in correct offset into bitplane
	bsr	Fill_1s			; Fill the bitplane
	bra	cmultx			; Skip back to do next bitplane
;
;	Fill bitplane with zero's (source bitplane pointer is NULL)
;
;	Fill_0s is defined outside this section; presumably it has the
;	same register interface as Fill_1s -- TODO confirm.
;
fillzeros:
	exg	d4,a0			; Restore register order
	exg	d5,a1			;
	add.l	a3,a1			; Add in correct offset into bitplane
	bsr	Fill_0s			; Fill the bitplane
	bra	cmultx			; Skip back to do next bitplane
;
;	That's it -- we're done! Now just pop remaining values off the stack
;	and return to the caller with d0 = number of bitplanes copied.
;
;
;	Common exit for the CPU path. On arrival the 64-byte pointer area
;	has already been popped, so the stack holds the register frame
;	saved at sblit2 followed by the caller's D6 saved at nb2.
;
doneblt:
	move.l	d7,d0			; Set return value = #bitplanes copied
	subq.l	#1,_UsageCount		; Decrement number of callers in code
	movem.l	(sp)+,d1-d5/d7/a0-a6	; Restore registers
	move.l	(sp)+,d6		; And this one too
	rts				; Return to caller

*****************************************************************************
*
*	CopyMultiple()
*
*	The following routines copy from 1 to 4 bitplanes which span more
*	than one longword boundary horizontally (i.e. the start and finish
*	bitplanes are in different longwords).
*
*	The routines are constructed mainly out of macros, to keep the source
*	code down to size (and also more manageable). All routines take the
*	following parameters:
*
*	Input:
*		D0 - Number of rows to copy
*		D1 - Mask for left edge of source	(000xxx)
*		D2 - Mask for right edge of source	(xxx000)
*		D3 - Number of longwords to copy
*	        A4 - Modulo of source (positive or negative)
*	        A5 - Modulo of destination (positive or negative)
*
*	In addition, pointers to the source/destination bitplanes are pushed
*	onto the stack, such that 4(SP) = src bp1, 8(SP) = dest bp1,
*	12(SP) = src bp2, 16(SP) = dest bp2 etc.
*
*	Output:
*		None
*
*****************************************************************************

*****************************************************************************
*
*	Macros used by the copy routines
*
*****************************************************************************

;-----------------------------------------------------------------------------
;	Init_Mult Label
;
;	This macro is the standard entry to each CopyMultiple() routine. It
;	checks to see whether the bitplane being copied contains at least
;	one full longword. If not, it branches to a separate routine
;	(loop?edges) which is smaller; doing this at the start saves having
;	to check for zero longwords each time through the main loop.
;	Label is the name of the routine to perform the separate copy.
;-----------------------------------------------------------------------------

;
;	In:  D0 = row count (must be at least 1: it is made zero-based here
;	     and then drives a DBF loop), D1/D2 = left/right source masks,
;	     D3 = number of full longwords between the two edges.
;	Out: D0 = rows - 1, D3 = middle longwords - 1, D4 = complement of D1,
;	     D5 = complement of D2. Branches to the label argument when
;	     there are no middle longwords.
;
Init_Mult macro
	subq.l	#1,d0			; Convert row count to zero-based
	move.l	d1,d4			; Copy left source mask
	not.l	d4			; And change it into destination mask
	move.l	d2,d5			; Copy right source mask
	not.l	d5			; Change into destination mask
	subq.l	#1,d3			; Adjust longword count to zero based
	bmi	\1			; If none to copy use separate routine
	endm

;-----------------------------------------------------------------------------
;	Left_Mult   src,dest
;
;	Copies the left hand side of the bitplane from register src to the
;	bitplane pointed to by dest, using the masks in d1/d4
;-----------------------------------------------------------------------------

;
;	Both address registers are advanced by one longword.
;	Uses D6 and D7 as scratch.
;
Left_Mult macro
	move.l	(\1)+,d6		; Read leftmost longword of source
	and.l	d1,d6			; Mask out bits not to be copied
	move.l	(\2),d7			; Read leftmost longword of dest
	and.l	d4,d7			; Mask out bits to remain the same
	or.l	d6,d7			; Merge source and dest columns
	move.l	d7,(\2)+		; Output first longword of row again
	endm

;-----------------------------------------------------------------------------
;	Copy_Mult src,dest
;
;	Copies all the full longwords between the left and right extremities
;	of the bitplane row from src to dest. Note that for 68010 upwards, it
;	is faster to copy using MOVE.L/DBF than to play tricks with MOVEM;
;	since this program will only be of use to systems with fast CPU's
;	anyway, this is the route we take.
;-----------------------------------------------------------------------------

;
;	Expects D3 to hold the longword count minus one, as left by
;	Init_Mult (DBF counts down to -1 on the low word of D6).
;	Uses D6 as scratch; both address registers end up just past the
;	copied longwords. The label suffix keeps each expansion's loop
;	label unique.
;
Copy_Mult macro
	move.l	d3,d6			; Copy longword count into scratch reg
loop_m\@:
	move.l	(\1)+,(\2)+		; Copy longwords
	dbf	d6,loop_m\@		;
	endm

;-----------------------------------------------------------------------------
;	Right_Mult   src,dest
;
;	Copies the right hand side of the bitplane from register src to the
;	bitplane pointed to by dest, using the masks in d2/d5
;-----------------------------------------------------------------------------

;
;	Both address registers are advanced by one longword.
;	Uses D6 and D7 as scratch.
;
Right_Mult macro
	move.l	(\1)+,d6		; Read rightmost longword of source
	and.l	d2,d6			; Mask out bits not being copied
	move.l	(\2),d7			; Read rightmost longword of dest
	and.l	d5,d7			; Mask out bits to remain the same
	or.l	d6,d7			; Merge source and dest columns
	move.l	d7,(\2)+		; Output right longword again
	endm

;-----------------------------------------------------------------------------
;	Advance src,dest
;
;	This macro advances the source and destination pointers to point to
;	the next row in the bitplane.
;-----------------------------------------------------------------------------

;
;	A4/A5 hold the source/dest modulos; they may be negative when the
;	copy runs from the last row backwards.
;
Advance macro
	add.l	a4,\1			; Increment source pointer
	add.l	a5,\2			; Increment dest pointer
	endm

;-----------------------------------------------------------------------------
;	Copy_Quick src,dest
;
;	This macro copies the left and right edges in one go, when there
;	are no complete longwords in between. It's quicker than having to
;	check for zero longwords each time through the main loop. The masks
;	used are d1/d4 for the left edge of the bitplane, d2/d5 for the
;	right edge.
;-----------------------------------------------------------------------------

;
;	A row with no full longwords in the middle is just a left edge
;	immediately followed by a right edge, so this macro is built from
;	the two edge macros. Both address registers are advanced by two
;	longwords; D6 and D7 are used as scratch.
;
Copy_Quick macro
	Left_Mult	\1,\2		; Left edge longword, masks d1/d4
	Right_Mult	\1,\2		; Right edge longword, masks d2/d5
	endm

*****************************************************************************
*
*	The actual copy routines, Copy1Multiple() ... Copy4Multiple()
*
*****************************************************************************

;-----------------------------------------------------------------------------
;
;	Copies a single bitplane
;
;-----------------------------------------------------------------------------

;
;	Stack on entry: (SP) = return address, 4(SP) = source pointer,
;	8(SP) = dest pointer. All registers are preserved. A6 is kept in
;	the save list so that the frame is 44 bytes and the pointers sit
;	at 48(SP).
;
Copy1Multiple:
	movem.l	d0-d7/a0-a1/a6,-(sp)	; Preserve everything we touch
	movem.l	48(sp),a0-a1		; A0 = source ptr, A1 = dest ptr
	Init_Mult	Copy1Quick	; Make dest masks; take the short
					; loop if there is no middle section
c1m_loop:
	Left_Mult	a0,a1		; Merge masked left-hand longword
	Copy_Mult	a0,a1		; Plain copy of the interior longwords
	Right_Mult	a0,a1		; Merge masked right-hand longword
	Advance		a0,a1		; Step both pointers to the next row
	dbra		d0,c1m_loop	; Go round again until rows run out
	movem.l	(sp)+,d0-d7/a0-a1/a6	; Put the registers back
	rts				; All done
;
;	Short loop for rows made of just a left and a right longword
;
Copy1Quick:
	Copy_Quick	a0,a1		; Merge both edge longwords
	Advance		a0,a1		; Step both pointers to the next row
	dbf		d0,Copy1Quick	; Go round again until rows run out
	movem.l	(sp)+,d0-d7/a0-a1/a6	; Put the registers back
	rts				; All done

;-----------------------------------------------------------------------------
;
;	Copies 2 bitplanes simultaneously
;
;-----------------------------------------------------------------------------

;
;	Stack on entry: (SP) = return address, then source/dest pointer
;	pairs for plane 1 and plane 2. All registers are preserved. A6 is
;	kept in the save list so that the frame is 52 bytes and the
;	pointers sit at 56(SP).
;
Copy2Multiple:
	movem.l	d0-d7/a0-a3/a6,-(sp)	; Preserve everything we touch
	movem.l	56(sp),a0-a3		; A0/A1 = plane 1 src/dest,
					; A2/A3 = plane 2 src/dest
	Init_Mult	Copy2Quick	; Make dest masks; take the short
					; loop if there is no middle section
c2m_loop:
	Left_Mult	a0,a1		; Plane 1: masked left-hand longword
	Left_Mult	a2,a3		; Plane 2: masked left-hand longword
	Copy_Mult	a0,a1		; Plane 1: interior longwords
	Copy_Mult	a2,a3		; Plane 2: interior longwords
	Right_Mult	a0,a1		; Plane 1: masked right-hand longword
	Right_Mult	a2,a3		; Plane 2: masked right-hand longword
	Advance		a0,a1		; Plane 1: on to the next row
	Advance		a2,a3		; Plane 2: on to the next row
	dbra		d0,c2m_loop	; Go round again until rows run out
	movem.l	(sp)+,d0-d7/a0-a3/a6	; Put the registers back
	rts				; All done
;
;	Short loop for rows made of just a left and a right longword
;
Copy2Quick:
	Copy_Quick	a0,a1		; Plane 1: both edge longwords
	Copy_Quick	a2,a3		; Plane 2: both edge longwords
	Advance		a0,a1		; Plane 1: on to the next row
	Advance		a2,a3		; Plane 2: on to the next row
	dbf		d0,Copy2Quick	; Go round again until rows run out
	movem.l	(sp)+,d0-d7/a0-a3/a6	; Put the registers back
	rts				; All done

;-----------------------------------------------------------------------------
;
;	Copies 3 bitplanes simultaneously
;
;-----------------------------------------------------------------------------

;
;	Planes 1 and 2 live in A0-A3 throughout. There are not enough
;	address registers for a third pair, so each row the plane 2
;	pointers are parked on the stack while A2/A3 are borrowed for
;	plane 3; the advanced plane 3 pointers are written back into the
;	caller's pointer list (addressed through A6) for the next row.
;
Copy3Multiple:
	movem.l	a0-a3/a6/d0-d7,-(sp)	; Save registers
	lea.l		56(sp),a6	; Get pointer to bitplanes
	movem.l		(a6)+,a0-a3	; Load bitplane ptrs 1 & 2 off stack
					; (A6 now points at plane 3's pair)
	Init_Mult	Copy3Quick	; Setup registers
c3m_loop:
	Left_Mult	a0,a1		; Copy left edge of bitplane 1
	Left_Mult	a2,a3		; Copy left edge of bitplane 2
	Copy_Mult	a0,a1		; Copy middle of bitplane 1
	Copy_Mult	a2,a3		; Copy middle of bitplane 2
	Right_Mult	a0,a1		; Copy right edge of bitplane 1
	Right_Mult	a2,a3		; Copy right edge of bitplane 2
	Advance		a0,a1		; Increment bitplane 1 ptrs
	Advance		a2,a3		; Increment bitplane 2 ptrs
	move.l		a3,-(sp)	; Save bitplane 2 ptrs
	move.l		a2,-(sp)	;
	move.l		(a6)+,a2	; Load bitplane 3 ptrs
	move.l		(a6),a3		; 
	Left_Mult	a2,a3		; Copy left edge of bitplane 3
	Copy_Mult	a2,a3		; Copy middle of bitplane 3
	Right_Mult	a2,a3		; Copy right edge of bitplane 3
	Advance		a2,a3		; Increment bitplane 3 ptrs
	move.l		a3,(a6)		; Save bitplane 3 ptrs
	move.l		a2,-(a6)	; (A6 is back at plane 3's source ptr)
	move.l		(sp)+,a2	; Restore bitplane 2 ptrs
	move.l		(sp)+,a3	;
	dbf		d0,c3m_loop	; Repeat for remaining rows
	movem.l	(sp)+,a0-a3/a6/d0-d7	; Restore registers
	rts				; Return to caller
;
;	Handle inner longword count of zero
;
Copy3Quick:
	Copy_Quick	a0,a1		; Copy left/right edge of bitplane 1
	Copy_Quick	a2,a3		; Copy left/right edge of bitplane 2
	Advance		a0,a1		; Increment bitplane 1 ptrs
	Advance		a2,a3		; Increment bitplane 2 ptrs
	move.l		a3,-(sp)	; Save bitplane 2 ptrs
	move.l		a2,-(sp)	;
	move.l		(a6)+,a2	; Load bitplane 3 ptrs
	move.l		(a6),a3		; 
	Copy_Quick	a2,a3		; Copy left/right edge of bitplane 3
	Advance		a2,a3		; Increment bitplane 3 ptrs
	move.l		a3,(a6)		; Save bitplane 3 ptrs
	move.l		a2,-(a6)	; (A6 is back at plane 3's source ptr)
	move.l		(sp)+,a2	; Restore bitplane 2 ptrs
	move.l		(sp)+,a3	;
	dbra		d0,Copy3Quick	; Repeat for all rows
	movem.l	(sp)+,a0-a3/a6/d0-d7	; Restore registers
	rts				; Return to caller

;-----------------------------------------------------------------------------
;
;	Copies 4 bitplanes simultaneously
;
;-----------------------------------------------------------------------------

;
;	Planes are processed two at a time. Each row, planes 1 and 2 are
;	copied from A0-A3, those pointers are parked on the stack, and
;	A0-A3 are reloaded with the plane 3 and 4 pointers from the
;	caller's pointer list (addressed through A6). The advanced plane
;	3 and 4 pointers are written back to that list for the next row.
;
Copy4Multiple:
	movem.l	a0-a3/a6/d0-d7,-(sp)	; Save registers
	lea.l		56(sp),a6	; Get pointer to bitplanes
	movem.l		(a6)+,a0-a3	; Load bitplane ptrs 1 & 2 off stack
					; (A6 now points at plane 3's pair)
	Init_Mult	Copy4Quick	; Setup registers
c4m_loop:
	Left_Mult	a0,a1		; Copy left edge of bitplane 1
	Left_Mult	a2,a3		; Copy left edge of bitplane 2
	Copy_Mult	a0,a1		; Copy middle of bitplane 1
	Copy_Mult	a2,a3		; Copy middle of bitplane 2
	Right_Mult	a0,a1		; Copy right edge of bitplane 1
	Right_Mult	a2,a3		; Copy right edge of bitplane 2
	Advance		a0,a1		; Increment bitplane 1 ptrs
	Advance		a2,a3		; Increment bitplane 2 ptrs
	movem.l		a0-a3,-(sp)	; Save bitplane 1,2 ptrs
	movem.l		(a6),a0-a3	; Load bitplane 3,4 ptrs
	Left_Mult	a0,a1		; Copy left edge of bitplane 3
	Left_Mult	a2,a3		; Copy left edge of bitplane 4
	Copy_Mult	a0,a1		; Copy middle of bitplane 3
	Copy_Mult	a2,a3		; Copy middle of bitplane 4
	Right_Mult	a0,a1		; Copy right edge of bitplane 3
	Right_Mult	a2,a3		; Copy right edge of bitplane 4
	Advance		a0,a1		; Increment bitplane 3 ptrs
	Advance		a2,a3		; Increment bitplane 4 ptrs
	movem.l		a0-a3,(a6)	; Save bitplane 3,4 ptrs
	movem.l		(sp)+,a0-a3	; Restore bitplane 1,2 ptrs
	dbf		d0,c4m_loop	; Repeat for remaining rows
	movem.l	(sp)+,a0-a3/a6/d0-d7	; Restore registers
	rts				; Return to caller
;
;	Handle inner longword count of zero
;
Copy4Quick:
	Copy_Quick	a0,a1		; Copy left/right edge of bitplane 1
	Copy_Quick	a2,a3		; Copy left/right edge of bitplane 2
	Advance		a0,a1		; Increment bitplane 1 ptrs
	Advance		a2,a3		; Increment bitplane 2 ptrs
	movem.l		a0-a3,-(sp)	; Save bitplane 1,2 ptrs
	movem.l		(a6),a0-a3	; Load bitplane 3,4 ptrs
	Copy_Quick	a0,a1		; Copy left/right edge of bitplane 3
	Copy_Quick	a2,a3		; Copy left/right edge of bitplane 4
	Advance		a0,a1		; Increment bitplane 3 ptrs
	Advance		a2,a3		; Increment bitplane 4 ptrs
	movem.l		a0-a3,(a6)	; Save bitplane 3,4 ptrs
	movem.l		(sp)+,a0-a3	; Restore bitplane 1,2 ptrs
	dbra		d0,Copy4Quick	; Repeat for all rows
	movem.l	(sp)+,a0-a3/a6/d0-d7	; Restore registers
	rts				; Return to caller

*****************************************************************************
*
*	CopySingle()
*
*	The following routines copy from 1 to 4 bitplanes that start and end
*	(horizontally) within a single longword. CopyMultiple can't be used
*	for such cases, since it always copies at least two longwords (one
*	for the left edge and one for the right).
*
*	Input:
*		D0 - Number of rows to copy
*		D1 - Mask of bits to be copied from source  (000xxx000)
*		A4 - Modulo of source bitplane
*		A5 - Modulo of dest bitplane
*
*	In addition, pointers to the source/destination bitplanes are pushed
*	onto the stack, such that 4(SP) = src bp1, 8(SP) = dest bp1,
*	12(SP) = src bp2, 16(SP) = dest bp2 etc.
*
*	Output:
*		None
*
*****************************************************************************

*****************************************************************************
*
*	Macros used by the copy routines
*
*****************************************************************************

;-----------------------------------------------------------------------------
;	Init_Sing
;
;	Common entry sequence shared by every CopySingle() routine.
;
;	In:	D0 = number of rows to copy
;		D1 = mask of the source bits that are to be copied
;	Out:	D0 = rows - 1, ready for use as a dbra loop counter
;		D2 = complement of D1, i.e. the destination bits to keep
;-----------------------------------------------------------------------------

Init_Sing macro
	subq.l	#1,d0			; dbra runs count+1 times, so use rows-1
	moveq	#-1,d2			; Start with every bit set ...
	eor.l	d1,d2			; ... and flip the source bits: D2 = ~D1
	endm

;-----------------------------------------------------------------------------
;	Copy_Dual src,dest
;
;	Merges one longword read through address register src into the
;	longword addressed by dest, then steps both pointers on by one
;	longword.
;
;	In:	D1 = mask of source bits to copy
;		D2 = mask of destination bits to preserve
;	Scratch: D3, D4
;-----------------------------------------------------------------------------

Copy_Dual macro
	move.l	(\1)+,d3		; Fetch source longword, step src ptr
	move.l	(\2),d4			; Fetch existing destination longword
	and.l	d1,d3			; Keep only the source bits being copied
	and.l	d2,d4			; Keep only the dest bits being preserved
	or.l	d3,d4			; Merge the two parts together
	move.l	d4,(\2)+		; Store the result, step dest ptr
	endm

*****************************************************************************
*
*	The actual copy routines, Copy1Single() ... Copy4Single()
*
*****************************************************************************

;-----------------------------------------------------------------------------
;
;	Copy1Single: one bitplane, a single longword wide.
;
;	Register and stack inputs are as described for CopySingle() above;
;	the source and destination pointers for the plane are the first
;	two longwords above the return address.
;
;-----------------------------------------------------------------------------

Copy1Single:
	movem.l	d0-d4/a0-a1/a6,-(sp)	; Push 8 registers (32 bytes)
	lea	36(sp),a6		; 32 saved + 4 return addr = first argument
	movem.l	(a6),a0-a1		; A0 = source ptr, A1 = destination ptr
	Init_Sing			; D0 = rows-1, D2 = ~D1
c1s_row:
	Copy_Dual	a0,a1		; Merge one masked longword
	Advance		a0,a1		; Step both ptrs on for the next row
	dbra		d0,c1s_row	; One pass per row
	movem.l	(sp)+,d0-d4/a0-a1/a6	; Put back everything we touched
	rts

;-----------------------------------------------------------------------------
;
;	Copy2Single: two bitplanes, each a single longword wide.
;
;	Register and stack inputs are as described for CopySingle() above;
;	the src/dest pointer pairs for planes 1 and 2 are the first four
;	longwords above the return address.
;
;-----------------------------------------------------------------------------

Copy2Single:
	movem.l	d0-d4/a0-a3/a6,-(sp)	; Push 10 registers (40 bytes)
	lea	44(sp),a6		; 40 saved + 4 return addr = first argument
	movem.l	(a6),a0-a3		; A0/A1 = plane 1 src/dest, A2/A3 = plane 2
	Init_Sing			; D0 = rows-1, D2 = ~D1
c2s_row:
	Copy_Dual	a0,a1		; Merge masked longword of plane 1
	Copy_Dual	a2,a3		; Merge masked longword of plane 2
	Advance		a0,a1		; Step plane 1 ptrs on for the next row
	Advance		a2,a3		; Step plane 2 ptrs on for the next row
	dbra		d0,c2s_row	; One pass per row
	movem.l	(sp)+,d0-d4/a0-a3/a6	; Put back everything we touched
	rts

;-----------------------------------------------------------------------------
;
;	Copies three bitplanes, each one longword wide
;
;	Register and stack inputs are as described for CopySingle() above.
;	Only four address registers are available for plane pointers, so
;	planes 1 and 2 live in A0-A3 while the plane 3 pointers stay in the
;	caller's stacked argument slots (addressed through A6) and are
;	swapped into A0/A1 for part of each row. The stacked plane 3
;	pointers are therefore updated in place as the copy proceeds.
;
;-----------------------------------------------------------------------------

Copy3Single:
	movem.l	a0-a3/a6/d0-d4,-(sp)	; Save registers (10 regs = 40 bytes)
	lea		44(sp),a6	; 40 saved + 4 return addr = first argument
	movem.l		(a6)+,a0-a3	; Load plane 1,2 ptrs; A6 -> plane 3 ptrs
	Init_Sing			; D0 = rows-1, D2 = ~D1
copy3slp:
	Copy_Dual	a0,a1		; Copy longword for bitplane 1
	Copy_Dual	a2,a3		; Copy longword for bitplane 2
	Advance		a0,a1		; Advance bitplane 1 ptrs
	Advance		a2,a3		; Advance bitplane 2 ptrs
	movem.l		a0-a1,-(sp)	; Park bitplane 1 ptrs on the stack
	movem.l		(a6),a0-a1	; Load bitplane 3 ptrs
	Copy_Dual	a0,a1		; Copy longword for bitplane 3
	Advance		a0,a1		; Advance bitplane 3 ptrs
	movem.l		a0-a1,(a6)	; Store updated bitplane 3 ptrs
	movem.l		(sp)+,a0-a1	; Recover bitplane 1 ptrs
	dbra		d0,copy3slp	; Repeat for all rows
	movem.l	(sp)+,a0-a3/a6/d0-d4	; Restore registers
	rts

;-----------------------------------------------------------------------------
;
;	Copy4Single: four bitplanes, each a single longword wide.
;
;	Register and stack inputs are as described for CopySingle() above.
;	Planes 1 and 2 are kept in A0-A3; the plane 3 and 4 pointers stay
;	in the caller's stacked argument slots (addressed through A6) and
;	are swapped into A0-A3 for the second half of every row, so those
;	stacked pointers are updated in place.
;
;-----------------------------------------------------------------------------

Copy4Single:
	movem.l	d0-d4/a0-a3/a6,-(sp)	; Push 10 registers (40 bytes)
	movem.l	44(sp),a0-a3		; 40 saved + 4 return addr: plane 1,2 ptrs
	lea	60(sp),a6		; Next four longwords: plane 3,4 ptrs
	Init_Sing			; D0 = rows-1, D2 = ~D1
c4s_row:
	Copy_Dual	a0,a1		; Merge masked longword of plane 1
	Copy_Dual	a2,a3		; Merge masked longword of plane 2
	Advance		a0,a1		; Step plane 1 ptrs on for the next row
	Advance		a2,a3		; Step plane 2 ptrs on for the next row
	movem.l		a0-a3,-(sp)	; Park plane 1,2 ptrs on the stack
	movem.l		(a6),a0-a3	; Swap in plane 3,4 ptrs
	Copy_Dual	a0,a1		; Merge masked longword of plane 3
	Copy_Dual	a2,a3		; Merge masked longword of plane 4
	Advance		a0,a1		; Step plane 3 ptrs on for the next row
	Advance		a2,a3		; Step plane 4 ptrs on for the next row
	movem.l		a0-a3,(a6)	; Write plane 3,4 ptrs back to their slots
	movem.l		(sp)+,a0-a3	; Recover plane 1,2 ptrs
	dbra		d0,c4s_row	; One pass per row
	movem.l	(sp)+,d0-d4/a0-a3/a6	; Put back everything we touched
	rts

*****************************************************************************
*
*	Fill_1s(), Fill_0s
*
*	Handles the case new for Workbench 2.0 where the source bitplane
*	pointer is not an address but a special value standing for a plane
*	of all ones (ptr = $FFFFFFFF) or all zeros (ptr = $00000000).
*
*	    Input:
*		D0 - Number of rows to copy
*		D1 - Mask for left edge of source	(000xxx)
*		D2 - Mask for right edge of source	(xxx000)
*		D3 - Number of whole longwords between the two edge longwords
*		     (0 means edges only, -1 means single column)
*		A1 - Pointer to dest bitplane
*		A5 - Modulo of dest bitplane
*
*	    Output:
*		None
*
*****************************************************************************

;-----------------------------------------------------------------------------
;	Set bitplane area to ones
;
;	D3 picks one of three row layouts:
;	    D3 < 0	one longword per row, masked with D1 AND D2
;	    D3 = 0	left edge and right edge longwords only
;	    D3 > 0	left edge, D3 whole longwords, right edge
;	All registers are preserved.
;-----------------------------------------------------------------------------

Fill_1s:
	movem.l	d0/d3/d6-d7/a1,-(sp)	; Keep everything we are about to alter
	moveq	#-1,d7			; D7 = all ones, the fill pattern
	subq.l	#1,d0			; dbra wants rows-1
	tst.l	d3			; Negative longword count?
	bmi.s	Fill_1single		; Yes, row fits in a single longword
	subq.w	#1,d3			; dbra wants longwords-1
	bmi.s	Fill_1quick		; Count was zero: edges only
;
;	General case: left edge, one or more whole longwords, right edge
;
fill1_row:
	or.l	d1,(a1)+		; Set the left edge bits
	move.l	d3,d6			; Fresh inner counter for this row
fill1_span:
	move.l	d7,(a1)+		; Set a whole longword
	dbra	d6,fill1_span		;
	or.l	d2,(a1)+		; Set the right edge bits
	adda.l	a5,a1			; Skip over modulo to the next row
	dbra	d0,fill1_row		; One pass per row
	bra.s	fill1_exit
;
;	No whole longwords: each row is just a left and a right edge
;
Fill_1quick:
	or.l	d1,(a1)+		; Set the left edge bits
	or.l	d2,(a1)+		; Set the right edge bits
	adda.l	a5,a1			; Skip over modulo to the next row
	dbra	d0,Fill_1quick		; One pass per row
	bra.s	fill1_exit
;
;	Both edges fall in the same longword: only bits in both masks are set
;
Fill_1single:
	move.l	d1,d6			; D6 = left mask ...
	and.l	d2,d6			; ... AND right mask
fill1_col:
	or.l	d6,(a1)+		; Set the selected bits
	adda.l	a5,a1			; Skip over modulo to the next row
	dbra	d0,fill1_col		; One pass per row
;
;	Common exit for all three cases
;
fill1_exit:
	movem.l	(sp)+,d0/d3/d6-d7/a1	; Restore registers
	rts				; Return to caller

;-----------------------------------------------------------------------------
;	Clear bitplane area to zeros
;
;	Same inputs as Fill_1s. The edge masks arrive marking the bits to
;	change, so they are inverted on entry and ANDed into the
;	destination. D3 picks one of three row layouts:
;	    D3 < 0	one longword per row; bits in both masks are cleared
;	    D3 = 0	left edge and right edge longwords only
;	    D3 > 0	left edge, D3 whole longwords, right edge
;	All registers are preserved.
;-----------------------------------------------------------------------------

Fill_0s:
	movem.l	d0-d3/d6-d7/a1,-(sp)	; Keep everything we are about to alter
	not.l	d1			; D1 = left edge bits to keep
	not.l	d2			; D2 = right edge bits to keep
	moveq	#0,d7			; D7 = all zeros, the fill pattern
	subq.l	#1,d0			; dbra wants rows-1
	tst.l	d3			; Negative longword count?
	bmi.s	Fill_0single		; Yes, row fits in a single longword
	subq.w	#1,d3			; dbra wants longwords-1
	bmi.s	Fill_0quick		; Count was zero: edges only
;
;	General case: left edge, one or more whole longwords, right edge
;
fill0_row:
	and.l	d1,(a1)+		; Clear the left edge bits
	move.l	d3,d6			; Fresh inner counter for this row
fill0_span:
	move.l	d7,(a1)+		; Clear a whole longword
	dbra	d6,fill0_span		;
	and.l	d2,(a1)+		; Clear the right edge bits
	adda.l	a5,a1			; Skip over modulo to the next row
	dbra	d0,fill0_row		; One pass per row
	bra.s	fill0_exit
;
;	No whole longwords: each row is just a left and a right edge
;
Fill_0quick:
	and.l	d1,(a1)+		; Clear the left edge bits
	and.l	d2,(a1)+		; Clear the right edge bits
	adda.l	a5,a1			; Skip over modulo to the next row
	dbra	d0,Fill_0quick		; One pass per row
	bra.s	fill0_exit
;
;	Both edges fall in the same longword: keep a bit if either inverted
;	mask keeps it, which clears exactly the bits set in both originals
;
Fill_0single:
	move.l	d1,d6			; D6 = inverted left mask ...
	or.l	d2,d6			; ... OR inverted right mask
fill0_col:
	and.l	d6,(a1)+		; Clear the selected bits
	adda.l	a5,a1			; Skip over modulo to the next row
	dbra	d0,fill0_col		; One pass per row
;
;	Common exit for all three cases
;
fill0_exit:
	movem.l	(sp)+,d0-d3/d6-d7/a1	; Restore registers
	rts				; Return to caller

*****************************************************************************
*
*	These two tables give the mask values used when copying the
*	bits at the edge of each bitplane row. Note that a right edge
*	of zero bits in width is handled as a special case in the code
*	(it gets converted to a bitmap which is one longword narrower
*	but has a right edge 32 bits wide).
*
*****************************************************************************

;
;	LeftMask: 32 longwords. Entry n (n = 0..31) has its n most
;	significant bits clear and the remaining 32-n bits set.
;	NOTE(review): presumably indexed by the bit offset of the left edge
;	within its longword - confirm against the code that reads the table.
;
LeftMask:
	dc.l	$ffffffff,$7fffffff,$3fffffff,$1fffffff	; Entries  0-3
	dc.l	$0fffffff,$07ffffff,$03ffffff,$01ffffff	; Entries  4-7
	dc.l	$00ffffff,$007fffff,$003fffff,$001fffff	; Entries  8-11
	dc.l	$000fffff,$0007ffff,$0003ffff,$0001ffff	; Entries 12-15
	dc.l	$0000ffff,$00007fff,$00003fff,$00001fff	; Entries 16-19
	dc.l	$00000fff,$000007ff,$000003ff,$000001ff	; Entries 20-23
	dc.l	$000000ff,$0000007f,$0000003f,$0000001f	; Entries 24-27
	dc.l	$0000000f,$00000007,$00000003,$00000001	; Entries 28-31

;
;	RightMask: 32 longwords. Entry n (n = 1..31) has its n most
;	significant bits set and the rest clear. Entry 0 is all ones, which
;	matches the full 32-bit right edge described in the note above.
;	NOTE(review): presumably indexed by the width in bits of the right
;	edge - confirm against the code that reads the table.
;
RightMask:
	dc.l	$ffffffff,$80000000,$c0000000,$e0000000	; Entries  0-3
	dc.l	$f0000000,$f8000000,$fc000000,$fe000000	; Entries  4-7
	dc.l	$ff000000,$ff800000,$ffc00000,$ffe00000	; Entries  8-11
	dc.l	$fff00000,$fff80000,$fffc0000,$fffe0000	; Entries 12-15
	dc.l	$ffff0000,$ffff8000,$ffffc000,$ffffe000	; Entries 16-19
	dc.l	$fffff000,$fffff800,$fffffc00,$fffffe00	; Entries 20-23
	dc.l	$ffffff00,$ffffff80,$ffffffc0,$ffffffe0	; Entries 24-27
	dc.l	$fffffff0,$fffffff8,$fffffffc,$fffffffe	; Entries 28-31


*****************************************************************************
*
*	Variables used by the code. _UsageCount is only ever updated
*	atomically (since the replacement code must be re-entrant), and
*	_BlitFunc is initialised by the startup code.
*
*****************************************************************************

	SECTION Scroll,DATA

	cnop	0,4			; Align the variables on a longword boundary

;
;	All of the underscore-prefixed names below are exported with XDEF at
;	the top of the file. The four longwords come first, then one byte
;	plus three bytes of padding, so the block is 20 bytes long.
;
;	_UsageCount starts at -1 rather than 0.
;	NOTE(review): presumably -1 means "no callers inside the code" -
;	confirm against the code that increments and decrements it.
;
_UsageCount:	dc.l	-1		; Number of callers currently in code
_BlitFunc:	dc.l	_StartBlit	; Address of function for blitter test
					; (statically preset to _StartBlit)
_OnlySingle:	dc.l	0		; Only use CPU when src bm == dest bm?
_Broken:	dc.l	0		; Accommodate broken software?
_MinTaskPri:	dc.b	0		; Ignore tasks with pri <= this
Pad		dc.b	0,0,0		; Padding to round to LW boundary

	END

