		; Assembler set-up.  NOTE(review): mc68020, multipass and debug are
		; assembler-specific directives whose exact effect is not shown in
		; this file -- check the assembler manual before changing them.
		mc68020
		multipass
	; DEBUG is not defined in this file; the debug directive below is
	; assembled only when bit 15 ($8000) of its value is set.
	if (_eval(DEBUG)&$8000)
		debug	on,lattice4
	endc

; ---------------------------------------------------------------------

; void __asm c2p8 (register __a2 UBYTE *fBUFFER,
;                  register __a3 UBYTE *fBUFFER_CMP,
;                  register __a4 PLANEPTR *planes,
;                  register __d0 ULONG signals1,
;                  register __d1 ULONG signals2,
;                  register __d2 ULONG pixels,     // width*height
;                  register __d3 ULONG offset,     // byte offset into plane
;                  register __a6 struct GfxBase *GfxBase,
;                  register __a5 UBYTE *xlate,
;                  register __d4 UWORD force_update);
;
; Pipelined CPU+blitter 8-plane chunky to planar converter.
; Optimised for 68020/30 with fastmem.
;
; Author: Peter McGavin (e-mail peterm@maths.grace.cri.nz), 21 April 1994
; Based on James McCoull's 4-pass blitter algorithm.
;
; This code is public domain.
;
; Use chunky comparison buffer.  Return immediately if no diffs found.
; Perform first 2 passes (Fast->Chip) with the CPU (in 1 pass).
; Only convert 32-pixel "units" that changed since last time.
; Update chunky comparison buffer.
; If nothing has changed, signal signals1 and return immediately.
; Wait for previous QBlit() to completely finish (signals2).
; Then launch passes 3 & 4 with QBlit().
; Return immediately after launching passes 3 & 4.
; Signal signals1 (asynchronously) after completion of pass 3.
; Signal signals2 from CleanUp() on completion of QBlit().
; Calling task must wait for signals1 before next call to c2p8().
;
; Approx timing (A4000/030, 320x200x8):
;	CPU pass min 13ms, max 37ms, depending how different (then return)
;	Asynchronous blitter passes add 62ms
;	Max framerate (with changes every frame) is 62ms/frame = 16fps
;		occurs when fBUFFER rendering time <= 25ms (= 62ms-37ms)
;
; (Unimplemented speedup idea: remember locations of the first and last
; differences, then blit only the parts that have changed.)
;
; (Another unimplemented speedup idea: use a "scrambled" chunky buffer.
; Skip pass 1.)
;
; Example usage:
;
;	/* clear fBUFFER, fBUFFER_CMP, and planes here */
;	if ((sigbit1 = AllocSignal(-1)) == -1 ||
;	    (sigbit2 = AllocSignal(-1)) == -1)
;		die ("Can't allocate signal!\n");
;	SetSignal ((1<<sigbit1)|(1<<sigbit2),  // initial state is "finished"
;		   (1<<sigbit1)|(1<<sigbit2));
;	for (;;) {
;		/* render to fBUFFER here */
;		Wait (1<<sigbit1);  // wait for prev c2p8() to finish pass 3
;		c2p8 (fBUFFER, fBUFFER_CMP, &RASTPORT->BitMap->Planes[0],
;		      1<<sigbit1, 1<<sigbit2, WIDTH*HEIGHT,
;		      WIDTH/8*LINESTOSKIP, GfxBase,
;		      xlate, force_update);  // last two are only read when depth<=6
;	}
;	Wait (1<<sigbit1);  // wait for last c2p8 to finish pass 3
;	Wait (1<<sigbit2);  // wait for last c2p8 to completely finish
;	FreeSignal(sigbit1);
;	FreeSignal(sigbit2);
;
; ---------------------------------------------------------------------

; Size limits.  maxpixels is the size in bytes of each of the intermediate
; buffers buff2 and buff3.
; NOTE(review): the pixels argument is never compared against maxpixels;
; presumably the caller must guarantee width*height <= maxpixels -- confirm.
maxwidth	equ	320	; must be a multiple of 32
maxheight	equ	200
maxpixels	equ	maxwidth*maxheight

; Value placed in the stat byte of mybltnode.
; NOTE(review): presumably the graphics CLEANUP flag that makes the system
; call the node's cleanup routine (qblitcleanup) -- confirm.
cleanup		equ	$40

; merge: exchange bit fields between two data registers.
; On exit the bits of \1 selected by mask \4 hold the corresponding bits of
; (\2 >> \3), and the bits of \2 selected by (\4 << \3) hold the corresponding
; bits of (\1 << \3); every other bit of \1 and \2 is unchanged.
; \5 is a scratch data register and is destroyed.  \4 is only read.
merge		macro	; \1 data1, \2 data2, \3=shift \4=mask, \5 temp
		move.l	\2,\5
		lsr.l	#\3,\5		; line data2's field up with data1's
		eor.l	\1,\5
		and.l	\4,\5		; temp = bits that differ, inside the mask
		eor.l	\5,\1		; data1 receives data2's field
		lsl.l	#\3,\5		; move the difference back to data2's position
		eor.l	\5,\2		; data2 receives data1's field
		endm

; xload: build one longword from four table-translated chunky bytes.
;   \1 = byte offset of the first pixel inside the current 16-byte group
;   \2 = destination data register (all 32 bits are overwritten)
; The bytes at offsets \1, \1+8, \1+2 and \1+10 are looked up through the
; table at a6 and packed into \2 from most to least significant byte.
; Requirements visible from the code:
;   - a2 already points 16 bytes past the group (all offsets are biased by -16)
;   - a6 points to the translation table
;   - bits 8-15 of d4 are zero, because d4.w is used as the table index after
;     only its low byte is loaded (the caller clears d4 with moveq #0,d4)
; d4 is clobbered.
xload		macro	; translate 4 8-bit pixels to 6-bit EHB using xlate[]
		move.b	(\1-16,a2),d4
		move.b	(a6,d4.w),\2
		lsl.w	#8,\2
		move.b	(\1-16+8,a2),d4
		move.b	(a6,d4.w),\2
		swap	\2		; first two translated bytes become the high word
		move.b	(\1-16+2,a2),d4
		move.b	(a6,d4.w),\2
		lsl.w	#8,\2
		move.b	(\1-16+10,a2),d4
		move.b	(a6,d4.w),\2
		endm

;		section chunks,code

	; Entry point.  The exported label is selected by the assembly-time
	; symbol "depth" (not defined in this file): _c2p_8_020 when depth = 8,
	; _c2p_6_020 when depth = 6; any other value reaches the fail directive.
	;
	; Register arguments as consumed by the code:
	;   a2 = chunky source buffer        a3 = chunky compare buffer
	;   a4 = ptr to list of plane ptrs   a6 = GfxBase
	;   d0 = signals1    d1 = signals2   d2 = pixels    d3 = plane byte offset
	;   a5 = xlate table, d4.w = force_update (both stored only if depth <= 6)
	ifeq	depth-8
		xdef	_c2p_8_020
_c2p_8_020:
	else
	ifeq	depth-6
		xdef	_c2p_6_020
_c2p_6_020:
	else
		fail	"unsupported depth!"
	endc
	endc
		movem.l	d2-d7/a2-a6,-(sp)	; popped again at ret

; Copy the arguments into the fields that follow mybltnode.
; NOTE(review): the queued blit routines read planes/offset/pixels* from
; these same fields, and they are overwritten here before signals2 has been
; waited for.  Presumably a caller that changes planes, offset or pixels
; between calls can disturb a still-running pass 4 -- confirm.

		move.l	#mybltnode,a0

		; values stored unchanged
		move.l	a2,(chunky-mybltnode,a0)
		move.l	a4,(planes-mybltnode,a0)
		move.l	a6,(gfxbase-mybltnode,a0)
		move.l	d0,(signals1-mybltnode,a0)
		move.l	d1,(signals2-mybltnode,a0)
		move.l	d3,(offset-mybltnode,a0)
	IFLE depth-6
		move.l	a5,(xlate-mybltnode,a0)
		move.w	d4,(force_update-mybltnode,a0)
	ENDC

		; pixel count and its successive halvings
		move.l	d2,(pixels-mybltnode,a0)
		lsr.l	#1,d2
		move.l	d2,(pixels2-mybltnode,a0)	; pixels/2
		lsr.l	#1,d2
		move.l	d2,(pixels4-mybltnode,a0)	; pixels/4
		lsr.l	#1,d2
		move.l	d2,(pixels8-mybltnode,a0)	; pixels/8
		lsr.l	#1,d2
		move.l	d2,(pixels16-mybltnode,a0)	; pixels/16

;-------------------------------------------------
;original chunky data
;0		a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0
;2		c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
;4		e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0
;6		g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
;8		i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0
;10		k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
;12		m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0
;14		o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
;16		q7q6q5q4q3q2q1q0 r7r6r5r4r3r2r1r0
;18		s7s6s5s4s3s2s1s0 t7t6t5t4t3t2t1t0
;20		u7u6u5u4u3u2u1u0 v7v6v5v4v3v2v1v0
;22		w7w6w5w4w3w2w1w0 x7x6x5x4x3x2x1x0
;24		y7y6y5y4y3y2y1y0 z7z6z5z4z3z2z1z0
;26		A7A6A5A4A3A2A1A0 B7B6B5B4B3B2B1B0
;28		C7C6C5C4C3C2C1C0 D7D6D5D4D3D2D1D0
;30		E7E6E5E4E3E2E1E0 F7F6F5F4F3F2F1F0
;-------------------------------------------------

		; Set-up for the CPU pass.  Expects a0 -> mybltnode on entry.
		move.l	#$00ff00ff,d5	; mask used by the 8-bit merge (depth > 6)
		move.l	#$0f0f0f0f,d7	; mask used by the 4-bit exchange

		move.l	(pixels16-mybltnode,a0),d6
		lsr.l	#1,d6		; d6 = pixels/32 = number of 32-pixel units

	IFLE depth-6
		move.l	(xlate-mybltnode,a0),a6	; a6 -> xlate
	ENDC

		; four output cursors, one per quarter of buff2
		move.l	(pixels4-mybltnode,a0),d0	; d0 = pixels/4
		move.l	#buff2,a0	; a0 -> buff2 (in Chip)
		lea	(a0,d0.l),a1	; a1 -> buff2+pixels/4
		lea	(a1,d0.l),a4	; a4 -> buff2+pixels/2
		lea	(a4,d0.l),a5	; a5 -> buff2+3*pixels/4

		bra.b	end_pass1loop	; enter the counted loop at its dbra

		cnop	0,4

; main loop (starts here) processes 32 chunky pixels at a time
; compare next 32 pixels with compare page, looking for differences
;
; Registers: a2 = chunky source, a3 = compare buffer, a0/a1/a4/a5 = output
; cursors, d6.w = units remaining (dbra counter).
; This copy of the loop runs until the first unit is converted; if it runs
; to completion nothing was converted and control goes to done2.

initpass1loop:
	IFLE depth-6
		tst.w	(force_update)	; force_update?
		bne	go_c2p		; yes: convert without comparing
	ENDC
		; Eight longword compares cover the 32-pixel unit.  Each cmpm
		; post-increments both pointers, so fixN rewinds them by 4*N.
		cmpm.l	(a2)+,(a3)+
		bne	fix1
		cmpm.l	(a2)+,(a3)+
		bne	fix2
		cmpm.l	(a2)+,(a3)+
		bne	fix3
		cmpm.l	(a2)+,(a3)+
		bne	fix4
		cmpm.l	(a2)+,(a3)+
		bne	fix5
		cmpm.l	(a2)+,(a3)+
		bne	fix6
		cmpm.l	(a2)+,(a3)+
		bne	fix7
		cmpm.l	(a2)+,(a3)+
		bne	fix8

		; unit unchanged: step every output cursor past its 8 bytes
		addq.l	#8,a0		; skip 8 bytes in output
		addq.l	#8,a1		; skip 8 bytes in output
		addq.l	#8,a4		; skip 8 bytes in output
		addq.l	#8,a5		; skip 8 bytes in output

end_pass1loop:	dbra	d6,initpass1loop

		bra	done2		; no unit was converted

		cnop	0,4

; This becomes the main loop after the first difference is found
;
; Same compare as initpass1loop; the only difference is the exit: when the
; counter runs out here, control goes to done (blits are queued) instead of
; done2.

pass1loop:
	IFLE depth-6
		tst.w	(force_update)	; force_update?
		bne	go_c2p		; yes: convert without comparing
	ENDC
		; fixN rewinds a2/a3 by 4*N after the N-th compare fails
		cmpm.l	(a2)+,(a3)+
		bne	fix1
		cmpm.l	(a2)+,(a3)+
		bne	fix2
		cmpm.l	(a2)+,(a3)+
		bne	fix3
		cmpm.l	(a2)+,(a3)+
		bne	fix4
		cmpm.l	(a2)+,(a3)+
		bne	fix5
		cmpm.l	(a2)+,(a3)+
		bne	fix6
		cmpm.l	(a2)+,(a3)+
		bne	fix7
		cmpm.l	(a2)+,(a3)+
		bne	fix8

		; unit unchanged: step every output cursor past its 8 bytes
		addq.l	#8,a0		; skip 8 bytes in output
		addq.l	#8,a1		; skip 8 bytes in output
		addq.l	#8,a4		; skip 8 bytes in output
		addq.l	#8,a5		; skip 8 bytes in output

		dbra	d6,pass1loop

		bra	done

; A cmpm.l in one of the compare loops found a difference.  fixN is entered
; after N longwords have been stepped over by the post-increments, so a2 and
; a3 are moved back 4*N bytes to the start of the current 32-pixel unit.
; None of these instructions touch the data registers or output cursors.

fix8:		lea	(-32,a2),a2
		lea	(-32,a3),a3
		bra.b	go_c2p

fix7:		lea	(-28,a2),a2
		lea	(-28,a3),a3
		bra.b	go_c2p

fix6:		lea	(-24,a2),a2
		lea	(-24,a3),a3
		bra.b	go_c2p

fix5:		lea	(-20,a2),a2
		lea	(-20,a3),a3
		bra.b	go_c2p

fix4:		lea	(-16,a2),a2
		lea	(-16,a3),a3
		bra.b	go_c2p

fix3:		lea	(-12,a2),a2
		lea	(-12,a3),a3
		bra.b	go_c2p

fix2:		subq.l	#8,a2
		subq.l	#8,a3
		bra.b	go_c2p

fix1:		subq.l	#4,a2
		subq.l	#4,a3
					; falls through into go_c2p

; convert 32 pixels (passes 1 and 2 combined)
;
; The body handles 16 pixels (four longwords) and is executed twice per unit.
; In:  a2 = source, a3 = compare buffer, a0/a1/a4/a5 = output cursors,
;      d7 = $0f0f0f0f, d5 = $00ff00ff (used only when depth > 6),
;      a6 = xlate table (depth <= 6), d6 = unit counter plus pass flag in bit 16
; Each execution writes one longword through each of a0, a4, a1 and a5.
; The chip-RAM stores are spread between the register operations; keep the
; instruction order if timing matters.

go_c2p:		movem.l	(a2)+,d0-d3	; AaBbCcDd EeFfGgHh IiJjKkLl MmNnOoPp

		movem.l	d0-d3,(a3)	; update compare buffer
		adda.w	#16,a3

	IFGT depth-6
		merge	d0,d2,8,d5,d4	; d0=AaIiCcKk d2=BbJjDdLl
		merge	d1,d3,8,d5,d4	; d1=EeMmGgOo d3=FfNnHhPp
	ELSE
		; d0-d3 are rebuilt from memory through the table; d4 must be
		; clear above its low byte because xload indexes with d4.w
		moveq	#0,d4
		xload	0,d0		; d0=xlate[AaIiCcKk]
		xload	4,d1		; d1=xlate[EeMmGgOo]
		xload	1,d2		; d2=xlate[BbJjDdLl]
		xload	5,d3		; d3=xlate[FfNnHhPp]
	ENDC
		; 4-bit exchange between d0 and d1 (same steps as the merge macro)
		move.l	d1,d4
		lsr.l	#4,d4
		eor.l	d0,d4
		and.l	d7,d4
		eor.l	d4,d0		; d0=AEIMCGKO

		move.l	d0,(a0)+

		lsl.l	#4,d4
		eor.l	d4,d1		; d1=aeimcgko
		; start the same exchange for d2/d3 before storing d1
		move.l	d3,d4
		lsr.l	#4,d4

		move.l	d1,(a4)+

		eor.l	d2,d4
		and.l	d7,d4
		eor.l	d4,d2		; d2=BFJNDHLP

		move.l	d2,(a1)+

		lsl.l	#4,d4
		eor.l	d4,d3		; d3=bfjndhlp

		move.l	d3,(a5)+

		; bchg tests bit 16 of d6 and then inverts it: Z is set the first
		; time (bit was 0), so the body runs once more; the second time the
		; bit is 1, Z is clear and the bit returns to 0.  dbra only counts
		; in the low word, so the flag bit does not disturb the unit count.
		; NOTE(review): relies on bit 16 of pixels/32 being 0 at loop start.
		bchg	#16,d6		; repeat inner loop twice
		beq	go_c2p

		dbra	d6,pass1loop

; done: reached once every unit has been processed and at least one of them
; was converted.  Records the current task, waits for signals2 (signalled by
; qblitcleanup), then hands mybltnode to QBlit() so that passes 3 and 4 run
; through the blit callbacks, and returns.

done:		move.l	(4).w,a6	; a6->SysBase
		move.l	#mybltnode,a2	; a2->mybltnode
		move.l	(signals2-mybltnode,a2),d0
		move.l	(ThisTask,a6),(task-mybltnode,a2) ; task to be signalled later
		jsr	(_LVOWait,a6)	; wait for signals2

		move.l	(gfxbase-mybltnode,a2),a6
		move.l	a2,a1		; a1->mybltnode for QBlit()
		jsr	(_LVOQBlit,a6)

		bra	ret


; done2: reached when no unit was converted.  Sets signals1 on the calling
; task with SetSignal() (new value and mask are both signals1) and returns
; without queueing any blit.

done2:		move.l	(4).w,a6	; a6->SysBase
		move.l	#mybltnode,a2
		move.l	(signals1-mybltnode,a2),d1	; mask
		move.l	d1,d0				; new signal state
		jsr	(_LVOSetSignal,a6)

ret:		movem.l	(sp)+,d2-d7/a2-a6
		rts

;-----------------------------------------------------------------------------
; QBlit functions (called asynchronously)

;-------------------------------------------------
;buff2 after pass 2
;0		a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4
;2		c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
;4		q7q6q5q4u7u6u5u4 y7y6y5y4C7C6C5C4
;6		s7s6s5s4w7w6w5w4 A7A6A5A4E7E6E5E4
;
;Pixels/4+0	b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4
;Pixels/4+2	d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
;Pixels/4+4	r7r6r5r4v7v6v5v4 z7z6z5z4D7D6D5D4
;Pixels/4+6	t7t6t5t4x7x6x5x4 B7B6B5B4F7F6F5F4
;
;Pixels/2+0	a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0
;Pixels/2+2	c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
;Pixels/2+4	q3q2q1q0u3u2u1u0 y3y2y1y0C3C2C1C0
;Pixels/2+6	s3s2s1s0w3w2w1w0 A3A2A1A0E3E2E1E0
;
;3*Pixels/4+0	b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0	
;3*Pixels/4+2	d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
;3*Pixels/4+4	r3r2r1r0v3v2v1v0 z3z2z1z0D3D2D1D0
;3*Pixels/4+6	t3t2t1t0x3x2x1x0 B3B2B1B0F3F2F1F0
;-------------------------------------------------

;Pass 3, subpass 1  (first blit callback; qblitfunc initially points here)
;	apt		Buff2
;	bpt		Buff2+2
;	dpt		Buff3
;	amod		2
;	bmod		2
;	dmod		0
;	cdat		$cccc
;	sizv		Pixels/4
;	sizh		1 word
;	con		D=AC+(B>>2)~C, ascending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit32; d0 = -1, a0 clobbered.  The last flag-setting
;      instruction stores a non-zero address, so Z is clear on return.
;      NOTE(review): presumably that is how the caller learns that another
;      step follows -- confirm against the QBlit() documentation.

blit31:		move.w	(pixels4+2-mybltnode,a1),(bltsizv,a0)	; low word of pixels/4
		move.l	#buff2,(bltapt,a0)
		move.l	#buff2+2,(bltbpt,a0)
		move.l	#buff3,(bltdpt,a0)
		move.w	#2,(bltamod,a0)			; 2
		move.w	#2,(bltbmod,a0)			; 2
		move.w	#0,(bltdmod,a0)
		move.w	#$cccc,(bltcdat,a0)
		moveq	#-1,d0
		move.l	d0,(bltafwm,a0)		; long write of all ones at bltafwm
		move.l	#$0DE42000,(bltcon0,a0)	; D=AC+(B>>2)~C
		move.w	#1,(bltsizh,a0)		;do blit
		lea	(blit32,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		rts

;Pass 3, subpass 2  (descending; modulos, cdat and bltsizv are not rewritten
;here -- only pointers and bltcon0 change relative to blit31)
;	apt		Buff2+Pixels-2-2
;	bpt		Buff2+Pixels-2
;	dpt		Buff3+Pixels-2
;	amod		2
;	bmod		2
;	dmod		0
;	cdat		$cccc
;	sizv		Pixels/4
;	sizh		1 word
;	con		D=(A<<2)C+B~C, descending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit47 (depth > 6) or blit43 (depth <= 6);
;      d0 and a0 clobbered; Z clear on return (non-zero address stored last).

blit32:		move.l	#buff3,d0
		add.l	(pixels-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltdpt,a0)		; buff3+pixels-2
		sub.l	#buff3-buff2,d0
		move.l	d0,(bltbpt,a0)		; buff2+pixels-2
		subq.l	#2,d0
		move.l	d0,(bltapt,a0)		; buff2+pixels-2-2
		move.l	#$2DE40002,(bltcon0,a0)	; D=(A<<2)C+B~C, desc.
		move.w	#1,(bltsizh,a0)		;do blit
	IFGT depth-6
		lea	(blit47,pc),a0
	ELSE
		lea	(blit43,pc),a0
	ENDC
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		rts

;-------------------------------------------------
;buff3 after pass 3
;0		a7a6c7c6e7e6g7g6 i7i6k7k6m7m6o7o6
;2		q7q6s7s6u7u6w7w6 y7y6A7A6C7C6E7E6
;
;Pixels/8+0	b7b6d7d6f7f6h7h6 j7j6l7l6n7n6p7p6
;Pixels/8+2	r7r6t7t6v7v6x7x6 z7z6B7B6D7D6F7F6
;
;Pixels/4+0	a3a2c3c2e3e2g3g2 i3i2k3k2m3m2o3o2
;Pixels/4+2	q3q2s3s2u3u2w3w2 y3y2A3A2C3C2E3E2
;
;3*Pixels/8+0	b3b2d3d2f3f2h3h2 j3j2l3l2n3n2p3p2
;3*Pixels/8+2	r3r2t3t2v3v2x3x2 z3z2B3B2D3D2F3F2
;
;Pixels/2+0	a5a4c5c4e5e4g5g4 i5i4k5k4m5m4o5o4
;Pixels/2+2	q5q4s5s4u5u4w5w4 y5y4A5A4C5C4E5E4
;
;5*Pixels/8+0	b5b4d5d4f5f4h5h4 j5j4l5l4n5n4p5p4
;5*Pixels/8+2	r5r4t5t4v5v4x5x4 z5z4B5B4D5D4F5F4
;
;3*Pixels/4+0	a1a0c1c0e1e0g1g0 i1i0k1k0m1m0o1o0
;3*Pixels/4+2	q1q0s1s0u1u0w1w0 y1y0A1A0C1C0E1E0
;
;7*Pixels/8+0	b1b0d1d0f1f0h1h0 j1j0l1l0n1n0p1p0
;7*Pixels/8+2	r1r0t1t0v1v0x1x0 z1z0B1B0D1D0F1F0
;-------------------------------------------------

	IFGT depth-6

;Pass 4, plane 7  (assembled only when depth > 6)
;	apt		Buff3+0*pixels/8
;	bpt		Buff3+1*pixels/8
;	dpt		Plane7+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		Pixels/16
;	sizh		1 word
;	con		D=AC+(B>>1)~C, ascending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; First pass-4 step for depth > 6: writes the modulos, size, cdat and bltcon0
; that the following ascending steps reuse, starts the plane 7 blit, then
; signals signals1 to the saved task.
; Out: qblitfunc -> blit43; a2, a1 and a6 preserved on the stack; Z clear on
;      return (movem does not alter the flags set by the final move.l).

blit47:		movem.l	a2,-(sp)

		move.w	#0,(bltamod,a0)
		move.w	#0,(bltbmod,a0)
		move.w	(pixels16+2-mybltnode,a1),(bltsizv,a0)	; pixels/16
		move.w	#$aaaa,(bltcdat,a0)
		move.l	#$0DE41000,(bltcon0,a0)	; D=AC+(B>>1)~C

		move.l	#buff3,d0
		move.l	d0,(bltapt,a0)		; buff3+0*pixels/8
		add.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltbpt,a0)		; buff3+1*pixels/8
		move.l	(planes-mybltnode,a1),a2
		move.l	(7*4,a2),d0		; planes[7]
		add.l	(offset-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; Plane7+offset
		move.w	#1,(bltsizh,a0)		;plane 7

		; a1 and a6 are needed for the Signal() call, so save them first
		movem.l	a1/a6,-(sp)
		move.l	(signals1-mybltnode,a1),d0
		move.l	(task-mybltnode,a1),a1
		move.l	(4).w,a6		; a6->SysBase
		jsr	(_LVOSignal,a6)		; signal pass 3 has finished
		movem.l	(sp)+,a1/a6

		lea	(blit43,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		movem.l	(sp)+,a2
		rts

;-------------------------------------------------
;Plane7		a7b7c7d7e7f7g7h7 i7j7k7l7m7n7o7p7
;Plane7+2	q7r7s7t7u7v7w7x7 y7z7A7B7C7D7E7F7
;-------------------------------------------------

	ENDC

;Pass 4, plane 3
;	apt		buff3+2*pixels/8
;	bpt		buff3+3*pixels/8
;	dpt		Plane3+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=AC+(B>>1)~C, ascending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; When depth <= 6 this is the first pass-4 step (blit32 chains straight to
; it), so it also writes the modulos, size, cdat and bltcon0 and performs the
; signals1 notification; when depth > 6 blit47 has already done both.
; Out: qblitfunc -> blit45; a2 preserved on the stack; Z clear on return
;      (the final move to a2 does not alter the flags).

blit43:		move.l	a2,-(sp)		; preserve a2
	IFLE depth-6
		move.w	#0,(bltamod,a0)
		move.w	#0,(bltbmod,a0)
		move.w	(pixels16+2-mybltnode,a1),(bltsizv,a0)	; pixels/16
		move.w	#$aaaa,(bltcdat,a0)
		move.l	#$0DE41000,(bltcon0,a0)	; D=AC+(B>>1)~C
	ENDC
		move.l	#buff3,d0
		add.l	(pixels4-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff3+2*pixels/8
		add.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltbpt,a0)		; buff3+3*pixels/8
		move.l	(planes-mybltnode,a1),a2
		move.l	(3*4,a2),d0		; planes[3]
		add.l	(offset-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; Plane3+offset
		move.w	#1,(bltsizh,a0)		;plane 3
	IFLE depth-6
		; a1 and a6 are needed for the Signal() call, so save them first
		movem.l	a1/a6,-(sp)
		move.l	(signals1-mybltnode,a1),d0
		move.l	(task-mybltnode,a1),a1
		move.l	(4).w,a6		; a6->SysBase
		jsr	(_LVOSignal,a6)		; signal pass 3 has finished
		movem.l	(sp)+,a1/a6
	ENDC
		lea	(blit45,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		move.l	(sp)+,a2		; restore a2
		rts

;-------------------------------------------------
;Plane3		a3b3c3d3e3f3g3h3 i3j3k3l3m3n3o3p3
;Plane3+2	q3r3s3t3u3v3w3x3 y3z3A3B3C3D3E3F3
;-------------------------------------------------

;Pass 4, plane 5  (ascending; modulos, size, cdat and bltcon0 are not written
;here -- they are left as programmed by an earlier step)
;	apt		buff3+4*pixels/8
;	bpt		buff3+5*pixels/8
;	dpt		Plane5+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=AC+(B>>1)~C, ascending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit41; d0, d1, a0 clobbered; a2 preserved via d1;
;      Z clear on return (non-zero address stored last, move to a2 keeps flags)

blit45:		move.l	a2,d1			; park a2 in d1
		move.l	(planes-mybltnode,a1),a2
		move.l	(5*4,a2),d0		; planes[5]
		add.l	(offset-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; Plane5+offset
		move.l	#buff3,d0
		add.l	(pixels2-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff3+4*pixels/8
		add.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltbpt,a0)		; buff3+5*pixels/8
		move.w	#1,(bltsizh,a0)		;plane 5
		lea	(blit41,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		move.l	d1,a2			; a2 back
		rts

;-------------------------------------------------
;Plane5		a5b5c5d5e5f5g5h5 i5j5k5l5m5n5o5p5
;Plane5+2	q5r5s5t5u5v5w5x5 y5z5A5B5C5D5E5F5
;-------------------------------------------------

;Pass 4, plane 1  (last ascending step; modulos, size, cdat and bltcon0 are
;not written here -- they are left as programmed by an earlier step)
;	apt		buff3+6*pixels/8
;	bpt		buff3+7*pixels/8
;	dpt		Plane1+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=AC+(B>>1)~C, ascending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit46 (depth > 6) or blit42 (depth <= 6);
;      d0, d1, a0 clobbered; a2 preserved via d1; Z clear on return

blit41:		move.l	a2,d1			; park a2 in d1
		move.l	(planes-mybltnode,a1),a2
		move.l	(1*4,a2),d0		; planes[1]
		add.l	(offset-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; Plane1+offset
		move.l	#buff3,d0
		add.l	(pixels4-mybltnode,a1),d0
		add.l	(pixels2-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff3+6*pixels/8
		add.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltbpt,a0)		; buff3+7*pixels/8
		move.w	#1,(bltsizh,a0)		;plane 1
	IFGT depth-6
		lea	(blit46,pc),a0
	ELSE
		lea	(blit42,pc),a0
	ENDC
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		move.l	d1,a2			; a2 back
		rts

;-------------------------------------------------
;Plane1		a1b1c1d1e1f1g1h1 i1j1k1l1m1n1o1p1
;Plane1+2	q1r1s1t1u1v1w1x1 y1z1A1B1C1D1E1F1
;-------------------------------------------------

	IFGT depth-6

;Pass 4, plane 6  (assembled only when depth > 6; first descending step, so
;it rewrites bltcon0 for the descending steps that follow)
;	apt		buff3+1*pixels/8-2
;	bpt		buff3+2*pixels/8-2
;	dpt		Plane6+plsiz-2+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=(A<<1)C+B~C, descending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit42; d0, d1, a0 clobbered; a2 preserved via d1;
;      Z clear on return

blit46:		move.l	a2,d1			; park a2 in d1
		move.l	(planes-mybltnode,a1),a2
		move.l	(6*4,a2),d0		; planes[6]
		add.l	(offset-mybltnode,a1),d0
		add.l	(pixels8-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltdpt,a0)		; Plane6+offset+plsiz-2
		move.l	#buff3,d0
		add.l	(pixels8-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltapt,a0)		; buff3+1*pixels/8-2
		add.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltbpt,a0)		; buff3+2*pixels/8-2
		move.l	#$1DE40002,(bltcon0,a0)	; D=(A<<1)C+B~C, desc.
		move.w	#1,(bltsizh,a0)		;plane 6
		lea	(blit42,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		move.l	d1,a2			; a2 back
		rts

;-------------------------------------------------
;Plane6		a6b6c6d6e6f6g6h6 i6j6k6l6m6n6o6p6
;Plane6+2	q6r6s6t6u6v6w6x6 y6z6A6B6C6D6E6F6
;-------------------------------------------------

	ENDC

;Pass 4, plane 2  (descending; when depth <= 6 this is the first descending
;step and therefore writes bltcon0 itself, otherwise blit46 already has)
;	apt		buff3+3*pixels/8-2
;	bpt		buff3+4*pixels/8-2
;	dpt		Plane2+plsiz-2+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=(A<<1)C+B~C, descending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit44; d0, d1, a0 clobbered; a2 preserved via d1;
;      Z clear on return

blit42:		move.l	a2,d1			; park a2 in d1
		move.l	(planes-mybltnode,a1),a2
		move.l	(2*4,a2),d0		; planes[2]
		add.l	(offset-mybltnode,a1),d0
		add.l	(pixels8-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltdpt,a0)		; Plane2+offset+plsiz-2
		move.l	#buff3,d0
		add.l	(pixels2-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltbpt,a0)		; buff3+4*pixels/8-2
		sub.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff3+3*pixels/8-2
	IFLE depth-6
		move.l	#$1DE40002,(bltcon0,a0)	; D=(A<<1)C+B~C, desc.
	ENDC
		move.w	#1,(bltsizh,a0)		;plane 2
		lea	(blit44,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		move.l	d1,a2			; a2 back
		rts

;-------------------------------------------------
;Plane2		a2b2c2d2e2f2g2h2 i2j2k2l2m2n2o2p2
;Plane2+2	q2r2s2t2u2v2w2x2 y2z2A2B2C2D2E2F2
;-------------------------------------------------

;Pass 4, plane 4  (descending; modulos, size, cdat and bltcon0 are not
;written here -- they are left as programmed by an earlier step)
;	apt		buff3+5*pixels/8-2
;	bpt		buff3+6*pixels/8-2
;	dpt		Plane4+plsiz-2+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=(A<<1)C+B~C, descending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc -> blit40; d0, d1, a0 clobbered; a2 preserved via d1;
;      Z clear on return

blit44:		move.l	a2,d1			; park a2 in d1
		move.l	(planes-mybltnode,a1),a2
		move.l	(4*4,a2),d0		; planes[4]
		add.l	(offset-mybltnode,a1),d0
		add.l	(pixels8-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltdpt,a0)		; Plane4+offset+plsiz-2
		move.l	#buff3,d0
		add.l	(pixels2-mybltnode,a1),d0
		add.l	(pixels4-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltbpt,a0)		; buff3+6*pixels/8-2
		sub.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff3+5*pixels/8-2
		move.w	#1,(bltsizh,a0)		;plane 4
		lea	(blit40,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; next step
		move.l	d1,a2			; a2 back
		rts

;-------------------------------------------------
;Plane4		a4b4c4d4e4f4g4h4 i4j4k4l4m4n4o4p4
;Plane4+2	q4r4s4t4u4v4w4x4 y4z4A4B4C4D4E4F4
;-------------------------------------------------

;Pass 4, plane 0  (final step of the sequence)
;	apt		buff3+7*pixels/8-2
;	bpt		buff3+8*pixels/8-2
;	dpt		Plane0+plsiz-2+offset
;	amod		0
;	bmod		0
;	dmod		0
;	cdat		$aaaa
;	sizv		pixels/16
;	sizh		1 word
;	con		D=(A<<1)C+B~C, descending
;
; In:  a0 = base for the blt* register offsets, a1 -> mybltnode
; Out: qblitfunc reset to blit31 for the next QBlit(); d1, a0 clobbered;
;      a2 preserved via d1; d0 = 0 and Z set on return -- the only step that
;      returns this way.
;      NOTE(review): presumably Z set marks the sequence as finished so the
;      cleanup routine gets called -- confirm against the QBlit() docs.

blit40:		move.l	a2,d1			; park a2 in d1
		move.l	(planes-mybltnode,a1),a2
		move.l	(a2),d0			; planes[0]
		add.l	(offset-mybltnode,a1),d0
		add.l	(pixels8-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltdpt,a0)		; Plane0+offset+plsiz-2
		move.l	#buff3,d0
		add.l	(pixels-mybltnode,a1),d0
		subq.l	#2,d0
		move.l	d0,(bltbpt,a0)		; buff3+8*pixels/8-2
		sub.l	(pixels8-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff3+7*pixels/8-2
		move.w	#1,(bltsizh,a0)		;plane 0
		lea	(blit31,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)	; rewind to the first step
		move.l	d1,a2			; a2 back
		moveq	#0,d0			; set Z flag
		rts

;-------------------------------------------------
;Plane0		a0b0c0d0e0f0g0h0 i0j0k0l0m0n0o0p0
;Plane0+2	q0r0s0t0u0v0w0x0 y0z0A0B0C0D0E0F0
;-------------------------------------------------

; Cleanup routine stored in mybltnode.  Sends signals2 to the task that was
; recorded at "done", which is what the next call waits for before it queues
; another blit sequence.  a2 and a6 are preserved.
qblitcleanup:	movem.l	a2/a6,-(sp)
		move.l	(4).w,a6		; a6->SysBase
		move.l	#mybltnode,a2
		move.l	(signals2-mybltnode,a2),d0
		move.l	(task-mybltnode,a2),a1	; signal QBlit() has finished
		jsr	(_LVOSignal,a6)		; may be called from interrupts
		movem.l	(sp)+,a2/a6
		rts

;-----------------------------------------------------------------------------
		section	data,data

; The node handed to QBlit().  qblitfunc is rewritten by every blit step so
; that it always points at the next step; blit40 resets it to blit31.
		quad
mybltnode:	dc.l	0		; next bltnode
qblitfunc:	dc.l	blit31		; ptr to qblitfunc()
		dc.b	cleanup		; stat
		dc.b	0		; filler
		dc.w	0		; blitsize
		dc.w	0		; beamsync
		dc.l	qblitcleanup	; ptr to qblitcleanup()

; Private state, addressed relative to mybltnode (x-mybltnode).  Filled in
; at function entry (task at "done") and read back by the blit callbacks.
		quad
chunky:		dc.l	0		; ptr to original chunky data (stored, not read in this file)
planes:		dc.l	0		; ptr to list of output plane ptrs
pixels:		dc.l	0		; width*height
pixels2:	dc.l	0		; width*height/2
pixels4:	dc.l	0		; width*height/4
pixels8:	dc.l	0		; width*height/8
pixels16:	dc.l	0		; width*height/16
offset:		dc.l	0		; byte offset into plane
task:		dc.l	0		; ptr to this task
signals1:	dc.l	0		; signals to Signal() after pass 3
signals2:	dc.l	0		; signals to Signal() at cleanup
gfxbase:	dc.l	0		; GfxBase
xlate:		dc.l	0		; ptr to translation table (stored only when depth <= 6)
force_update:	dc.w	0		; non-zero: convert every unit (tested only when depth <= 6)

;-----------------------------------------------------------------------------
		section	segment1,bss,chip		; MUST BE IN CHIP !!!!!

; buff2 is written by the CPU pass and read by blit31/blit32; buff3 is
; written by blit31/blit32 and read by the pass-4 blits.  Each is maxpixels
; bytes long.
buff2		ds.b	maxpixels	;Intermediate buffer 2
buff3		ds.b	maxpixels	;Intermediate buffer 3

;-----------------------------------------------------------------------------

		end
