mc68020
	        multipass
		debug	on,lattice4

		xdef    @R_DrawColumn_040
		xdef    @R_DrawSpan_040
		xdef    @R_DrawColumn_060
		xdef    @R_DrawSpan_060

		xref    _dc_yl
		xref    _dc_yh
		xref    _dc_x
		xref    _columnofs
		xref    _ylookup
		xref    _dc_iscale
		xref    _centery
		xref    _dc_texturemid
		xref    _dc_source
		xref    _dc_colormap
		xref    _ds_xfrac
		xref    _ds_yfrac
		xref    _ds_x1
		xref    _ds_y
		xref    _ds_x2
		xref    _ds_xstep
		xref    _ds_ystep
		xref    _ds_source
		xref    _ds_colormap

; Row stride of the chunky destination buffer, in bytes.  The column
; drawers add it to the destination pointer (or use it as a store
; displacement) to step one screen row down.
SCREENWIDTH equ 320
; Number of fractional bits in the fixed-point texture coordinates.
; Not referenced by the live code in this file (the routines hard-code
; 16.16 by using swap); it appears only in the commented-out reference code.
FRACBITS    equ 16

;***********************************************************************
;@R_DrawColumn  movem.l d3-d7/a2-a5,-(sp)
;       move.l  (_dc_yl),d0
;       move.l  (_dc_yh),d7
;       sub.l   d0,d7
;       bmi.b   1$
;       move.l  (_dc_x),d1
;       lea (_columnofs),a5
;       lea (a5,d1.l*4),a1
;       lea (_ylookup),a5
;       movea.l (a5,d0.l*4),a2
;       adda.l  (a1),a2
;       move.l  (_dc_iscale),d6
;       sub.l   (_centery),d0
;       muls.l  d6,d0
;       move.l  (_dc_texturemid),d5
;       add.l   d0,d5
;       movea.l (_dc_source),a3
;       movea.l (_dc_colormap),a4
;       moveq   #127,d4
;       move.l  #SCREENWIDTH,d3
;       moveq   #0,d1       ; ensure high bits of d1 are clear
;
;;2$        move.l  d5,d0       ; frac
;;      swap    d0
;;      and.w   d4,d0       ; (frac>>16)&127
;;      move.b  (a3,d0.w),d1    ; dc_source[(frac>>FRACBITS)&127]
;;      move.b  (a4,d1.w),(a2)  ; *dest = dc_colormap[d1]
;;      adda.l  d3,a2       ; dest += SCREENWIDTH
;;      add.l   d6,d5       ; frac += fracstep
;;      dbra    d7,2$
;;1$        movem.l (sp)+,d3-d7/a2-a5
;;      rts
;
;; faster routine from j.selck@flensburg.netsurf.de:
;
;       add.w   d6,d5       ; frac += fracstep (also sets X flag)
;       swap    d5      ; swap(frac)
;       swap    d6      ; swap(fracstep)
;       and.w   d4,d5       ; (frac>>16)&127
;2$     move.b  (a3,d5.w),d1    ; dc_source[(frac>>FRACBITS)&127]
;       move.b  (a4,d1.w),(a2)  ; *dest = dc_colormap[d1]
;       addx.l  d6,d5       ; swap(frac += fracstep), use & set X
;       adda.l  d3,a2       ; dest += SCREENWIDTH
;       and.w   d4,d5       ; (frac>>16)&127
;       dbra    d7,2$       ; !! dbra slow on 68060 !!
;1$     movem.l (sp)+,d3-d7/a2-a5
;       rts


; This even faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>

		cnop    0,4

; R_DrawColumn_060 -- draw one vertical texture column (68060 scheduling).
;
; Inputs (globals): _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;                   _dc_texturemid, _dc_source, _dc_colormap,
;                   _ylookup[], _columnofs[].
; Effect: writes (_dc_yh - _dc_yl + 1) bytes, one per screen row, starting
;         at ylookup[_dc_yl] + columnofs[_dc_x]; each byte is
;         dc_colormap[dc_source[(frac >> 16) & 127]] with frac stepped by
;         _dc_iscale per row.  Draws nothing when _dc_yh < _dc_yl.
; Registers: d2-d3/d5-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;            used as scratch without being saved.
;
; Structure: (count & 3) rows in .skip_loop, then 4 rows per pass of the
; unrolled .loop, then exactly one closing row at .last, where
; count = _dc_yh - _dc_yl.  Columns with count < 4 never enter .loop and
; reach the closing row through .short.
@R_DrawColumn_060
		movem.l d2-d3/d5-d7/a2/a3,-(sp)

		move.l  (_dc_yh),d7     ; count = _dc_yh - _dc_yl
		move.l  (_dc_yl),d0
		sub.l   d0,d7
		bmi     .end            ; _dc_yh < _dc_yl: nothing to draw

		move.l  (_dc_x),d1      ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		lea     (_ylookup),a0
		move.l  (a0,d0.l*4),a0
		lea     (_columnofs),a1
		add.l   (a1,d1.l*4),a0

		move.l  (_dc_colormap),a2
		move.l  (_dc_source),a1

		move.l  (_dc_iscale),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   (_centery),d0
		muls.l  d1,d0
		add.l   (_dc_texturemid),d0

		moveq   #$7f,d3         ; texture row mask (127)
		move.l  #SCREENWIDTH,a3

		move.l  d7,d6           ; Do the leftover (count & 3) rows in
		and.w   #3,d6           ; this loop.
		beq     .skip
.skip_loop
		move.l  d0,d5
		swap    d5
		and.l   d3,d5           ; (frac >> 16) & 127
		move.b  (a1,d5.w),d5    ; texel = dc_source[...]
		add.l   d1,d0           ; frac += fracstep
		move.b  (a2,d5.w),(a0)  ; *dest = dc_colormap[texel]
		add.l   a3,a0           ; dest += SCREENWIDTH
		subq.l  #1,d6
		bne     .skip_loop

; d7: cnt >> 2
; a0: chunky
; a1: texture
; a2: light_table
; d0: frac  (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac*2   (.......................................)
; d2: frac+dfrac(.......................................)
; d3: $7f
; a3: SCREENWIDTH

.skip
		lsr.l   #2,d7
		subq.l  #1,d7
		bmi     .short          ; fewer than 4 rows left: only the
					; closing row remains to be drawn

		add.l   a3,a3

		move.l  d0,d2
		add.l   a3,a3           ; a3 = 4*SCREENWIDTH (rows per pass)
		add.l   d1,d2           ; d2 = frac of the odd rows
		add.l   d1,d1           ; each accumulator steps two rows

		eor.w   d0,d2           ; swap the fraction part for addx
		eor.w   d2,d0           ; assuming 16.16 fixed point
		eor.w   d0,d2

		swap    d0              ; swap decimals and fraction
		swap    d1
		swap    d2

		moveq   #0,d5
		and.w   d3,d2
		and.w   d3,d0

		sub.w   d1,d0
		add.l   d1,d0           ; setup the X flag

		move.b  (a1,d2.w),d5    ; preload the texel of the first odd row
.loop
		; This should be reasonably scheduled for
		; m68060. It should perform well on other processors
		; too. That AGU stall still bothers me though.
		;
		; d0 and d2 alternate on a single X-flag carry chain, so
		; nothing between the addx instructions may modify X.

		move.b  (a1,d0.w),d6        ; stall + pOEP but allows sOEP
		addx.l  d1,d2               ; pOEP only
		move.b  (a2,d5.l),d5        ; pOEP but allows sOEP
		and.w   d3,d2               ; sOEP
		move.b  (a2,d6.l),d6        ; pOEP but allows sOEP
		move.b  d5,(SCREENWIDTH,a0) ; sOEP
		addx.l  d1,d0               ; pOEP only
		move.b  (a1,d2.w),d5        ; pOEP but allows sOEP
		and.w   d3,d0               ; sOEP
		move.b  d6,(a0)             ; pOEP
						; = ~4 cycles/pixel
						; + cache misses

		; The vertical writes are the true timehog of the loop
		; because of the characteristics of the copyback cache
		; operation.

		; Better mark the chunky buffer as write through
		; with the MMU and have all the horizontal writes
		; be longs aligned to longword boundary.

		move.b  (a1,d0.w),d6
		addx.l  d1,d2
		move.b  (a2,d5.l),d5
		and.w   d3,d2
		move.b  (a2,d6.l),d6
		move.b  d5,(SCREENWIDTH*3,a0)
		addx.l  d1,d0
		move.b  (a1,d2.w),d5
		and.w   d3,d0
		move.b  d6,(SCREENWIDTH*2,a0)

		add.l   a3,a0
.loop_end
		dbf d7,.loop

		; Closing row.  d0.w holds (frac >> 16) & 127 for it and the
		; upper 24 bits of d6 are zero.
		; it's faster to divide it to two lines on 060
		; and shouldn't be slower on 040.
.last
		move.b  (a1,d0.w),d6    ; texel
		move.b  (a2,d6.l),d6    ; colormap lookup
		move.b  d6,(a0)

.end
		movem.l (sp)+,d2-d3/d5-d7/a2/a3
		rts

		; Short column (count < 4): the unrolled loop is skipped, so
		; d0 is still a plain 16.16 value.  Bring it into the form the
		; closing row expects and draw that row; without this the last
		; pixel of every short column would be dropped.
.short
		swap    d0
		and.w   d3,d0           ; d0.w = (frac >> 16) & 127
		moveq   #0,d6           ; (a2,d6.l) needs clean upper bits
		bra     .last

		cnop    0,4

; 030/040 version

; R_DrawColumn_040 -- draw one vertical texture column (030/040 version).
;
; Inputs (globals): _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;                   _dc_texturemid, _dc_source, _dc_colormap,
;                   _ylookup[], _columnofs[].
; Effect: writes (_dc_yh - _dc_yl + 1) bytes, one per screen row, starting
;         at ylookup[_dc_yl] + columnofs[_dc_x].  Draws nothing when
;         _dc_yh < _dc_yl.
; Registers: d2-d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;            used as scratch without being saved.
;
; The colormap lookup is done by overwriting the low byte of the
; colormap pointer (kept in d4) with the texel and using the result as
; an address, so the lookup is only correct when the low byte of
; _dc_colormap is zero (table aligned to a 256 byte boundary).
@R_DrawColumn_040
		movem.l d2-d4/d6-d7/a2/a3,-(sp)

		move.l  (_dc_yh),d7     ; count = _dc_yh - _dc_yl
		move.l  (_dc_yl),d0
		sub.l   d0,d7
		bmi     .end            ; _dc_yh < _dc_yl: nothing to draw
		addq.l  #1,d7           ; d7 = number of rows to draw

		move.l  (_dc_x),d1      ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		lea     (_ylookup),a0
		move.l  (a0,d0.l*4),a0
		lea     (_columnofs),a1
		add.l   (a1,d1.l*4),a0

		move.l  (_dc_colormap),d4       ; pointer kept in a data register
		move.l  (_dc_source),a1

		move.l  (_dc_iscale),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   (_centery),d0
		muls.l  d1,d0
		add.l   (_dc_texturemid),d0

		moveq   #$7f,d3         ; texture row mask (127)
		move.l  #SCREENWIDTH,a3

		move.l  d7,d6           ; Do the leftover iterations in
		and.w   #3,d6           ; this loop.
		beq     .skip
.skip_loop
		move.l  d0,d2
		swap    d2
		and.l   d3,d2           ; (frac >> 16) & 127
		move.b  (a1,d2.w),d4    ; texel replaces low byte of colormap ptr
		move.l  d4,a2           ; a2 = &colormap[texel]
		move.b  (a2),(a0)       ; *dest = colormap[texel]
		add.l   d1,d0           ; frac += fracstep
		add.l   a3,a0           ; dest += SCREENWIDTH
		subq.l  #1,d6
		bne     .skip_loop

; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac  (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac (.......................................)
; d3: $7f
; d4: light table aligned to 256 byte boundary
; a3: SCREENWIDTH

.skip
		lsr.l   #2,d7
		subq.l  #1,d7           ; dbf counts down to -1
		bmi     .end            ; no full group of four rows left

		add.l   a3,a3
		add.l   a3,a3           ; a3 = 4*SCREENWIDTH (rows per pass)

		swap    d0              ; swap decimals and fraction
		swap    d1

		and.w   d3,d0

		sub.w   d1,d0
		add.l   d1,d0           ; setup the X flag

		; Four rows per pass.  Each addx.l adds the integer step plus
		; the carry (X) left by the previous fraction add, and leaves
		; the next fraction carry in X; none of the instructions in
		; between modify X.
.loop
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
		move.b  (a2),(a0)
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
		move.b  (a2),(SCREENWIDTH,a0)
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
		move.b  (a2),(SCREENWIDTH*2,a0)
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
		move.b  (a2),(SCREENWIDTH*3,a0)

		add.l   a3,a0           ; dest += 4*SCREENWIDTH
.loop_end
		dbf d7,.loop
.end
		movem.l (sp)+,d2-d4/d6-d7/a2/a3
		rts


;void R_DrawColumn (void)
;{
;  int count;
;  byte* dest;
;  fixed_t frac;
;  fixed_t fracstep;
;
;  count = dc_yh - dc_yl;
;  if (count < 0)
;    return;
;  dest = ylookup[dc_yl] + columnofs[dc_x];
;  fracstep = dc_iscale;
;  frac = dc_texturemid + (dc_yl-centery)*fracstep;
;  do {
;    *dest = dc_colormap[dc_source[(frac>>FRACBITS)&127]];
;    dest += SCREENWIDTH;
;    frac += fracstep;
;  } while (count--);
;}

;***********************************************************************
;@R_DrawSpan    movem.l d2-d7/a2-a5,-(a7)
;       move.l  (_ds_x1),d0
;       lea (_columnofs),a5
;       lea (a5,d0.l*4),a1
;       move.l  (_ds_y),d1
;       lea (_ylookup),a5
;       movea.l (a5,d1.l*4),a2
;       adda.l  (a1),a2
;       move.l  (_ds_x2),d5
;       sub.l   d0,d5       ; count
;       movea.l (_ds_source),a3
;       movea.l (_ds_colormap),a4
;       move.l  (_ds_xstep),d3
;       move.l  (_ds_ystep),d4
;;-
;       moveq   #10,d2
;       moveq   #63,d6
;       move.l  #63*64,d7
;       movea.l (_ds_xfrac),a0  ; xfrac
;       movea.l (_ds_yfrac),a1  ; yfrac
;1$     move.l  a0,d0       ; xfrac
;       swap    d0
;       and.l   d6,d0       ; (xfrac>>16)&63
;       move.l  a1,d1       ; yfrac
;       asr.l   d2,d1
;       and.l   d7,d1       ; (yfrac>>10)&(63*64)
;       add.l   d0,d1       ; spot
;       moveq   #0,d0
;       move.b  (a3,d1.l),d0    ; ds_source[spot]
;       move.b  (a4,d0.w),(a2)+ ; *dest++ = ds_colormap[...]
;       adda.l  d3,a0       ; xfrac += ds_xstep
;       adda.l  d4,a1       ; yfrac += ds_ystep
;       dbra    d5,1$
;       movem.l (a7)+,d2-d7/a2-a5
;       rts

; This faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>

		cnop    0,4

; R_DrawSpan_060 -- draw one horizontal textured span (68060 scheduling).
;
; Inputs (globals): _ds_y, _ds_x1, _ds_x2, _ds_xfrac, _ds_yfrac,
;                   _ds_xstep, _ds_ystep, _ds_source, _ds_colormap,
;                   _ylookup[], _columnofs[].
; Effect: writes (_ds_x2 - _ds_x1 + 1) consecutive bytes starting at
;         ylookup[_ds_y] + columnofs[_ds_x1]; each byte is
;         ds_colormap[ds_source[spot]] with
;         spot = (((yfrac >> 16) & 63) << 6) | ((xfrac >> 16) & 63).
; Registers: d2-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are used as
;            scratch without being saved.
;
; Structure: up to one byte and then up to one word are written first so
; that the unrolled loop can store long words; the unrolled loop handles
; four pixels per pass and .loop3 finishes the remaining 0-3 pixels.
; The word-sized colormap reads (move.w (a2,d5),d4) fetch one byte beyond
; the selected entry; that extra byte is always overwritten before the
; store.
@R_DrawSpan_060
		movem.l d2-d7/a2/a3,-(sp)

		move.l  (_ds_y),d0
		move.l  (_ds_x1),d1     ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		lea     (_ylookup),a0
		move.l  (a0,d0.l*4),a0
		lea     (_columnofs),a1
		add.l   (a1,d1.l*4),a0

		move.l  (_ds_source),a1
		move.l  (_ds_colormap),a2

		move.l  (_ds_x2),d7     ; count = _ds_x2 - _ds_x1 + 1
		sub.l   d1,d7
		addq.l  #1,d7

		move.l  (_ds_xfrac),d0
		move.l  (_ds_yfrac),d1
		move.l  (_ds_xstep),d2
		move.l  (_ds_ystep),d3

		move.l  a0,d4
		btst    #0,d4
		beq     .skipb          ; destination address already even

		move.l  d0,d5           ; do the unaligned pixels
		move.l  d1,d6           ; so we can write to longword
		swap    d5              ; boundary in the main loop
		swap    d6
		and.w   #$3f,d5         ; (xfrac >> 16) & 63
		and.w   #$3f,d6         ; (yfrac >> 16) & 63
		lsl.w   #6,d6
		or.w    d5,d6           ; spot
		move.b  (a1,d6.w),d5
		add.l   d2,d0
		move.b  (a2,d5.w),(a0)+
		add.l   d3,d1
		move.l  a0,d4
		subq.l  #1,d7
.skipb
		btst    #1,d4
		beq     .skips          ; destination already long aligned
		moveq   #2,d4
		cmp.l   d4,d7
		bls     .skips          ; 2 or fewer pixels left: use the tail loop

		move.l  d0,d5           ; write two pixels
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.w  (a2,d5.w),d4    ; first pixel lands in the high byte
		add.l   d2,d0
		add.l   d3,d1
		move.l  d0,d5
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.b  (a2,d5.w),d4    ; second pixel replaces the low byte
		add.l   d2,d0
		move.w  d4,(a0)+
		add.l   d3,d1
		subq.l  #2,d7
.skips

; a0: chunky
; a1: texture
; a2: light_table
; d7: count >> 2
; d0: xfrac (vvvv vvvv vvvv vvvv 1111 1111 11UU UUUU)
; d1: yfrac (uuuu uuuu uuuu uuuu 1111 VVVV VV11 1111)
; d2: dxfrac
; d3: dyfrac
; a3: count & 3 (pixels left for .loop3)

		move.l  d7,d6           ; setup registers
		and.w   #3,d6
		move.l  d6,a3

		eor.w   d0,d1           ; swap fraction parts for addx
		eor.w   d2,d3
		eor.w   d1,d0
		eor.w   d3,d2
		eor.w   d0,d1
		eor.w   d2,d3

		swap    d0
		swap    d1
		swap    d2
		swap    d3

		lsl.w   #6,d1           ; y integer part moves to the V bits
		lsl.w   #6,d3

		moveq   #0,d6
		moveq   #0,d5

		sub.l   #$f000,a1       ; the index in d6 always has bits
					; 12-15 set, so bias the base by $f000

		lsr.l   #2,d7
		beq     .skip_loop2     ; fewer than 4 pixels: tail loop only
		subq.l  #1,d7

		sub.w   d3,d1
		add.l   d3,d1           ; setup the X flag

		or.w    #$ffc0,d0
		or.w    #$f03f,d1

		move.w  d0,d6
		and.w   d1,d6           ; d6 = $f000 | spot of the first pixel
		bra     .start_loop2

		cnop    0,8
.loop2
		; This should be reasonably scheduled for m68060.
		; It writes long words to long word aligned locations.
		; First of all that's the optimal way if you write
		; directly to a frame buffer on graphics cards.
		; Same holds true if you change the chunky buffer
		; cache mode to write through. See R_DrawColumn().
		;
		; The two addx.l instructions form one carry chain through
		; the X flag; nothing else in the loop modifies X.

		or.w    #$ffc0,d0       ; pOEP
		or.w    #$f03f,d1       ; sOEP
		move.b  (a2,d5.l),d4    ; pOEP but allows sOEP
		move.w  d0,d6           ; sOEP
		and.w   d1,d6           ; pOEP
		move.l  d4,(a0)+        ; sOEP
.start_loop2
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP

		or.w    #$ffc0,d0       ; sOEP
		or.w    #$f03f,d1       ; pOEP
		move.w  d0,d6           ; sOEP
		move.w  (a2,d5.l),d4    ; pOEP but allows sOEP
		and.w   d1,d6           ; sOEP
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP

		or.w    #$ffc0,d0       ; sOEP
		or.w    #$f03f,d1       ; pOEP
		move.w  d0,d6           ; sOEP
		move.b  (a2,d5.l),d4    ; pOEP but allows sOEP
		and.w   d1,d6           ; sOEP
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP

		or.w    #$ffc0,d0       ; sOEP
		or.w    #$f03f,d1       ; pOEP
		move.w  d0,d6           ; sOEP
		swap    d4              ; pOEP only
		move.w  (a2,d5.l),d4    ; pOEP but allows sOEP
		and.w   d1,d6           ; sOEP
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP
		dbf     d7,.loop2       ; pOEP only
					; = 7.75 cycles/pixel
		move.b  (a2,d5.l),d4    ; flush the last long word
		move.l  d4,(a0)+

		; The carry chain is still primed here: the last addx.l d3,d1
		; left the pending x-fraction carry in X and nothing since has
		; touched it.  Priming it again would advance the x fraction
		; one extra step and drop that carry, so skip the setup.
		bra     .tail
.skip_loop2

		sub.w   d3,d1
		add.l   d3,d1           ; setup the X flag (loop2 was skipped)
.tail
		move.l  a3,d7           ; remaining pixels (0-3); X unaffected
		bra     .loop_end2
.loop3
		or.w    #$ffc0,d0
		or.w    #$f03f,d1
		move.w  d0,d6
		and.w   d1,d6           ; d6 = $f000 | spot
		addx.l  d2,d0
		addx.l  d3,d1
		move.b  (a1,d6.l),d5
		move.b  (a2,d5.l),(a0)+
.loop_end2
		dbf     d7,.loop3
.end2
		movem.l (sp)+,d2-d7/a2/a3
		rts

		cnop    0,4

; 030/040 version

; R_DrawSpan_040 -- draw one horizontal textured span (030/040 version).
;
; Inputs (globals): _ds_y, _ds_x1, _ds_x2, _ds_xfrac, _ds_yfrac,
;                   _ds_xstep, _ds_ystep, _ds_source, _ds_colormap,
;                   _ylookup[], _columnofs[].
; Effect: writes (_ds_x2 - _ds_x1 + 1) consecutive bytes starting at
;         ylookup[_ds_y] + columnofs[_ds_x1]; each byte is
;         ds_colormap[ds_source[spot]] with
;         spot = (((yfrac >> 16) & 63) << 6) | ((xfrac >> 16) & 63).
; Registers: d2-d7/a2-a4 are saved and restored; d0-d1/a0-a1 are used as
;            scratch without being saved.
;
; In the main loops the colormap lookup is done by overwriting the low
; byte of the colormap pointer (kept in d4) with the texel, so those
; lookups are only correct when the low byte of _ds_colormap is zero
; (table aligned to a 256 byte boundary).  The word-sized colormap reads
; fetch one byte beyond the selected entry; that byte is always
; overwritten before the store.
@R_DrawSpan_040
		movem.l d2-d7/a2-a4,-(sp)

		move.l  (_ds_y),d0
		move.l  (_ds_x1),d1     ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		lea     (_ylookup),a0
		move.l  (a0,d0.l*4),a0
		lea     (_columnofs),a1
		add.l   (a1,d1.l*4),a0

		move.l  (_ds_source),a1
		move.l  (_ds_colormap),a2

		move.l  (_ds_x2),d7     ; count = _ds_x2 - _ds_x1 + 1
		sub.l   d1,d7
		addq.l  #1,d7

		move.l  (_ds_xfrac),d0
		move.l  (_ds_yfrac),d1
		move.l  (_ds_xstep),d2
		move.l  (_ds_ystep),d3

		move.l  a0,d4
		btst    #0,d4
		beq     .skipb          ; destination address already even

		move.l  d0,d5           ; do the unaligned pixels
		move.l  d1,d6           ; so we can write to longword
		swap    d5              ; boundary in the main loop
		swap    d6
		and.w   #$3f,d5         ; (xfrac >> 16) & 63
		and.w   #$3f,d6         ; (yfrac >> 16) & 63
		lsl.w   #6,d6
		or.w    d5,d6           ; spot
		move.b  (a1,d6.w),d5
		add.l   d2,d0
		move.b  (a2,d5.w),(a0)+
		add.l   d3,d1
		move.l  a0,d4
		subq.l  #1,d7
.skipb
		btst    #1,d4
		beq     .skips          ; destination already long aligned
		moveq   #2,d4
		cmp.l   d4,d7
		bls     .skips          ; 2 or fewer pixels left: use the tail loop

		move.l  d0,d5           ; write two pixels
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.w  (a2,d5.w),d4    ; first pixel lands in the high byte
		add.l   d2,d0
		add.l   d3,d1
		move.l  d0,d5
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.b  (a2,d5.w),d4    ; second pixel replaces the low byte
		add.l   d2,d0
		move.w  d4,(a0)+
		add.l   d3,d1
		subq.l  #2,d7
.skips

; a0: chunky
; a3: chunky end (end of the whole span)
; a4: chunky end of the long-word part
; a1: texture
; d4: light_table
; d0: xfrac (vvvv vvvv vvvv vvvv 1111 1111 11UU UUUU)
; d1: yfrac (uuuu uuuu uuuu uuuu 1111 VVVV VV11 1111)
; d2: dxfrac
; d3: dyfrac
; d6: x_or
; d7: y_or

		move.l  a2,d4
		add.l   #$1000,a1       ; catch 22: the index word $f000|spot is
					; sign extended, i.e. spot-$1000

		move.l  a0,a3
		add.l   d7,a3           ; a3 = one past the last pixel

		move.l  d7,d5
		and.b   #~3,d5          ; pixels handled four at a time

		move.l  a0,a4
		add.l   d5,a4           ; a4 = end of the long-word part

		eor.w   d0,d1           ; swap fraction parts for addx
		eor.w   d2,d3
		eor.w   d1,d0
		eor.w   d3,d2
		eor.w   d0,d1
		eor.w   d2,d3

		swap    d0
		swap    d1
		swap    d2
		swap    d3

		lsl.w   #6,d1           ; y integer part moves to the V bits
		lsl.w   #6,d3

		move.w  #$ffc0,d6
		move.w  #$f03f,d7

		lsr.w   #2,d5
		beq     .skip_loop2     ; fewer than 4 pixels: tail loop only

		sub.w   d3,d1
		add.l   d3,d1           ; setup the X flag

		; Per pixel: build $f000|spot in d0.w, fetch the texel for the
		; CURRENT position, then advance x and y.  The fetch has to
		; come before the addx pair: afterwards d0.w already contains
		; the x step (and can carry into the V bits or wrap outside
		; the texture).  The two addx.l form one carry chain through
		; X; none of the other instructions modify X.
.loop2
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0           ; d0.w = $f000 | spot
		move.b  (a1,d0.w),d4    ; texel -> low byte of colormap ptr
		addx.l  d2,d0
		addx.l  d3,d1
		move.l  d4,a2
		move.w  (a2),d5         ; pixel 0 in the high byte
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		addx.l  d3,d1
		move.l  d4,a2
		move.b  (a2),d5         ; pixel 1 in the low byte
		swap    d5
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		addx.l  d3,d1
		move.l  d4,a2
		move.w  (a2),d5         ; pixel 2
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		addx.l  d3,d1
		move.l  d4,a2
		move.b  (a2),d5         ; pixel 3

		move.l  d5,(a0)+
		cmp.l   a0,a4
		bne     .loop2

		; The carry chain is still primed here (cmp and move leave X
		; alone).  Priming it again would advance the x fraction one
		; extra step and drop the pending carry, so skip the setup.
		bra     .loop_end2
.skip_loop2

		sub.w   d3,d1
		add.l   d3,d1           ; setup the X flag (loop2 was skipped)

		bra     .loop_end2
.loop3
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0           ; d0.w = $f000 | spot
		move.b  (a1,d0.w),d4    ; fetch before advancing (see .loop2)
		addx.l  d2,d0
		addx.l  d3,d1
		move.l  d4,a2
		move.b  (a2),(a0)+
.loop_end2
		cmp.l   a0,a3
		bne     .loop3
.end2
		movem.l (sp)+,d2-d7/a2-a4
		rts


;void R_DrawSpan (void)
;{
;  fixed_t xfrac, yfrac;
;  byte* dest;
;  int count, spot;
;
;  xfrac = ds_xfrac;
;  yfrac = ds_yfrac;
;  dest = ylookup[ds_y] + columnofs[ds_x1];
;  count = ds_x2 - ds_x1;
;  do {
;    spot = ((yfrac>>(16-6))&(63*64)) + ((xfrac>>16)&63);
;    *dest++ = ds_colormap[ds_source[spot]];
;    xfrac += ds_xstep;
;    yfrac += ds_ystep;
;  } while (count--);
;}

;***********************************************************************

end