*
*       amiga_draw.s - optimized rendering
*       by Aki Laukkanen <amlaukka@cc.helsinki.fi>
*
*       This file is public domain.
*

;		mc68020
;		multipass
;	if (_eval(DEBUG)&$8000)
;		debug	on,lattice4
;	endc

		include "exec/types.i"
		include "exec/funcdef.i"
		include "exec/exec_lib.i"

;-----------------------------------------------------------------------

SCREENWIDTH	equ	320

FRACBITS	equ	16
FRACUNIT	equ	(1<<FRACBITS)

*
*       global functions
*

		xdef	_init_r_draw
		xdef	@init_r_draw

;;		xdef    _R_DrawColumn_030			; high detail
;;		xdef    @R_DrawColumn_030
		xdef    _R_DrawColumn_040			; high detail
		xdef    @R_DrawColumn_040
		xdef    _R_DrawSpan_040
		xdef    @R_DrawSpan_040
		xdef    _R_DrawColumn_060
		xdef    @R_DrawColumn_060
		xdef    _R_DrawSpan_060
		xdef    @R_DrawSpan_060
		xdef	_R_DrawFuzzColumn
		xdef	@R_DrawFuzzColumn
;;		xdef	_R_DrawTranslatedColumn
;;		xdef	@R_DrawTranslatedColumn

		xdef	_R_DrawSpanLow				; low detail
		xdef	@R_DrawSpanLow
		xdef	_R_DrawColumnLow
		xdef	@R_DrawColumnLow
		xdef	_R_DrawFuzzColumnLow
		xdef	@R_DrawFuzzColumnLow
;;		xdef	_R_DrawTranslatedColumnLow
;;		xdef	@R_DrawTranslatedColumnLow

		xdef	_R_RenderSegLoop
		xdef	@R_RenderSegLoop

		xdef	@R_MakeSpans
		xdef	_R_MakeSpans

		xdef	@R_DrawPlanes
		xdef	_R_DrawPlanes

		xdef	_R_DrawMaskedColumn
		xdef	@R_DrawMaskedColumn

		xdef	@R_MapPlane
		xdef	_R_MapPlane

		xdef	@R_GetColumn
		xdef	_R_GetColumn

		xdef	@W_CacheLumpNum
		xdef	_W_CacheLumpNum

		xdef	_V_DrawPatch
		xdef	@V_DrawPatch
		xdef	_V_DrawPatchDirect
		xdef	@V_DrawPatchDirect

		xdef	@P_DivlineSide
		xdef	_P_DivlineSide

*
*       needed symbols/labels
*

		xref	_SysBase
		xref	_SCREENWIDTH
		xref	_SCREENHEIGHT
		xref    _dc_yl
		xref    _dc_yh
		xref    _dc_x
		xref    _columnofs
;;		xref    _ylookup
		xref    _ylookup2
		xref    _dc_iscale
		xref    _centery
		xref    _dc_texturemid
		xref    _dc_source
		xref    _dc_colormap
		xref    _ds_xfrac
		xref    _ds_yfrac
		xref    _ds_x1
		xref    _ds_y
		xref    _ds_x2
		xref    _ds_xstep
		xref    _ds_ystep
		xref    _ds_source
		xref    _ds_colormap
		xref    _fuzzoffset
		xref	_fuzzpos
		xref	_viewheight
		xref    _dc_translation
		xref	_colormaps

;-----------------------------------------------------------------------
		section	text,code

; low detail drawing functions

;-----------------------------------------------------------------------
; patch _SCREENWIDTH into draw routines (self-modifying code) and flush caches

;-----------------------------------------------------------------------
; init_r_draw - write the run-time screen width into the draw routines.
;
; The draw routines below are assembled with the constant SCREENWIDTH in
; their displacement words and offset tables.  This routine overwrites
; those 16-bit fields with multiples of the _SCREENWIDTH variable and
; then calls exec's CacheClearU so the modified code is seen by the CPU.
;
;   swNN_x  labels mark an instruction; the 16-bit field at label+2 is
;           patched with N * width (N = 1..4).
;   swmN0_x labels mark a dc.w table entry; it is patched with -N * width.
;
; Uses d0/d1/a0 as scratch; a6 is saved and restored.
;-----------------------------------------------------------------------
_init_r_draw
@init_r_draw
		move.l	a6,-(sp)		; a6 is needed for the exec call

		lea	@init_r_draw(pc),a0	; a0 = base for every patch offset
		move.l	_SCREENWIDTH(a4),d1	; d1 = width, kept for the whole routine

; ---- +1 * width ------------------------------------------------------
		move.l	d1,d0
		move.w	d0,sw12_1-@init_r_draw+2(a0)
		move.w	d0,sw12_3-@init_r_draw+2(a0)
		move.w	d0,sw12_4-@init_r_draw+2(a0)
		move.w	d0,sw12_4a-@init_r_draw+2(a0)
		move.w	d0,sw12_5-@init_r_draw+2(a0)
		move.w	d0,sw12_5a-@init_r_draw+2(a0)
		move.w	d0,sw12_6-@init_r_draw+2(a0)
		move.w	d0,sw12_7-@init_r_draw+2(a0)
		move.w	d0,sw12_8-@init_r_draw+2(a0)

; ---- +2 * width ------------------------------------------------------
		add.l	d1,d0
		move.w	d0,sw22_1-@init_r_draw+2(a0)
		move.w	d0,sw22_3-@init_r_draw+2(a0)
		move.w	d0,sw22_4-@init_r_draw+2(a0)
		move.w	d0,sw22_4a-@init_r_draw+2(a0)
		move.w	d0,sw22_5-@init_r_draw+2(a0)
		move.w	d0,sw22_5a-@init_r_draw+2(a0)
		move.w	d0,sw22_6-@init_r_draw+2(a0)
		move.w	d0,sw22_7-@init_r_draw+2(a0)
		move.w	d0,sw22_8-@init_r_draw+2(a0)

; ---- +3 * width ------------------------------------------------------
		add.l	d1,d0
		move.w	d0,sw32_1-@init_r_draw+2(a0)
		move.w	d0,sw32_3-@init_r_draw+2(a0)
		move.w	d0,sw32_4-@init_r_draw+2(a0)
		move.w	d0,sw32_4a-@init_r_draw+2(a0)
		move.w	d0,sw32_5-@init_r_draw+2(a0)
		move.w	d0,sw32_5a-@init_r_draw+2(a0)
		move.w	d0,sw32_6-@init_r_draw+2(a0)
		move.w	d0,sw32_7-@init_r_draw+2(a0)
		move.w	d0,sw32_8-@init_r_draw+2(a0)

; ---- +4 * width (row-group stride loaded into a3 by the routines) ----
		add.l	d1,d0
		move.w	d0,sw42_1-@init_r_draw+2(a0)
		move.w	d0,sw42_3-@init_r_draw+2(a0)
		move.w	d0,sw42_4-@init_r_draw+2(a0)
		move.w	d0,sw42_5-@init_r_draw+2(a0)
		move.w	d0,sw42_6-@init_r_draw+2(a0)
		move.w	d0,sw42_8-@init_r_draw+2(a0)

; ---- -1 * width (table entries, patched in place, no +2) --------------
		move.l	d1,d0
		neg.l	d0
		move.w	d0,swm10_1-@init_r_draw(a0)
		move.w	d0,swm10_3-@init_r_draw(a0)
		move.w	d0,swm10_4-@init_r_draw(a0)
		move.w	d0,swm10_5-@init_r_draw(a0)
		move.w	d0,swm10_6-@init_r_draw(a0)
		move.w	d0,swm10_8-@init_r_draw(a0)

; ---- -2 * width ------------------------------------------------------
		sub.l	d1,d0
		move.w	d0,swm20_1-@init_r_draw(a0)
		move.w	d0,swm20_3-@init_r_draw(a0)
		move.w	d0,swm20_4-@init_r_draw(a0)
		move.w	d0,swm20_5-@init_r_draw(a0)
		move.w	d0,swm20_6-@init_r_draw(a0)
		move.w	d0,swm20_8-@init_r_draw(a0)

; ---- -3 * width ------------------------------------------------------
		sub.l	d1,d0
		move.w	d0,swm30_1-@init_r_draw(a0)
		move.w	d0,swm30_3-@init_r_draw(a0)
		move.w	d0,swm30_4-@init_r_draw(a0)
		move.w	d0,swm30_5-@init_r_draw(a0)
		move.w	d0,swm30_6-@init_r_draw(a0)
		move.w	d0,swm30_8-@init_r_draw(a0)

; ---- make the patched code visible to the instruction stream ---------
		movea.l	_SysBase(a4),a6
		jsr	_LVOCacheClearU(a6)

		movea.l	(sp)+,a6
		rts

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawColumnLow - draw one textured column, low detail.
;
; Reads (a4-relative globals): dc_yl, dc_yh, dc_x, ylookup2, dc_colormap,
; dc_source, dc_iscale, centery, dc_texturemid.
;
; Each texel is colormapped and stored as a WORD whose two bytes are the
; same pixel, at byte offset dc_x*2 in the row, so one column covers two
; adjacent screen bytes.  Rows are processed four at a time; the entry
; point into the unrolled body is chosen from (count & 3).
;
; The swNN_1 / swmN0_1 fields are rewritten by init_r_draw; the values
; assembled here (SCREENWIDTH = 320) are only placeholders.
;
; Preserves d3-d4/d6-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawColumnLow
@R_DrawColumnLow
		movem.l d3-d4/d6-d7/a2/a3,-(sp)

		move.l  _dc_yh(a4),d7	; count = _dc_yh - _dc_yl
		move.l  _dc_yl(a4),d0
		sub.l   d0,d7
		bmi.w   .end1		; dc_yh < dc_yl: nothing to draw

		move.l  _dc_x(a4),d1    ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		movea.l	_ylookup2(a4),a0 ;     = ylookup2[_dc_yl] + (_dc_x<<1)
		add.l	d1,d1		; dc_x <<= 1 
		move.l  (a0,d0.l*4),a0
		adda.l	d1,a0

		move.l  _dc_colormap(a4),d4
		move.l  _dc_source(a4),a1

		move.l  _dc_iscale(a4),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   _centery(a4),d0
		muls.l  d1,d0
		add.l   _dc_texturemid(a4),d0

		moveq   #$7f,d3
sw42_1		lea     (SCREENWIDTH*4).w,a3

; Register use in the loop:
; d7: cnt >> 2 (dbf counter, number of extra 4-row groups)
; a0: chunky (destination, biased so the fixed row offsets below fit)
; a1: texture
; d0: frac  after swap: fraction in the high word, integer in the low word
; d1: dfrac after swap: same layout as d0
; d3: $7f (texture index mask)
; d4: light table aligned to 256 byte boundary; the low byte is replaced
;     by the texel so d4 becomes the address of the colormapped pixel
; a3: SCREENWIDTH*4 (stride of one unrolled group)

		move.l  d7,d6
		and.w   #3,d6		; d6 = count & 3 selects the entry point

		swap    d0              ; swap decimals and fraction
		swap    d1

		add.w   .width_tab1(pc,d6.w*2),a0	; bias dest by -(3-d6) rows
		lsr.w   #2,d7
		move.w  .tmap_tab1(pc,d6.w*2),d6	; d6 = offset of the entry point

		and.w   d3,d0
		sub.w   d1,d0		; undo the integer part of the add below
		add.l   d1,d0           ; setup the X flag

		jmp 	.loop1(pc,d6.w)

		cnop    0,4
.width_tab1				; destination bias per entry point (patched)
swm30_1		dc.w    -3*SCREENWIDTH
swm20_1		dc.w    -2*SCREENWIDTH
swm10_1		dc.w    -1*SCREENWIDTH
		dc.w    0
.tmap_tab1				; entry point offsets, indexed by count & 3
		dc.w    .01-.loop1
		dc.w    .11-.loop1
		dc.w    .21-.loop1
		dc.w    .31-.loop1
.loop1
.31					; row 0 of the group
		move.b  (a1,d0.w),d4	; texel -> low byte of colormap address
		addx.l  d1,d0		; step frac; X carries fraction overflow
		move.l  d4,a2
		move.w  (a2),d6		; high byte of d6.w = mapped pixel
		and.w   d3,d0		; wrap texture index to 0..127
		move.b	(a2),d6		; low byte of d6.w = same mapped pixel
		move.w	d6,(a0)		; store the doubled pixel
.21					; row 1
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		move.w  (a2),d6
		and.w   d3,d0
		move.b	(a2),d6
sw12_1		move.w	d6,SCREENWIDTH(a0)
.11					; row 2
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		move.w  (a2),d6
		and.w   d3,d0
		move.b	(a2),d6
sw22_1		move.w	d6,SCREENWIDTH*2(a0)
.01					; row 3
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		move.w	(a2),d6
		and.w   d3,d0
		move.b  (a2),d6
sw32_1		move.w	d6,SCREENWIDTH*3(a0)

		add.l   a3,a0		; next group of four rows (flags untouched)
.loop_end1
		dbf 	d7,.loop1
.end1
		movem.l (sp)+,d3-d4/d6-d7/a2/a3
		rts

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawSpanLow - draw one horizontal flat span, low detail.
;
; Reads (a4-relative globals): ds_y, ds_x1, ds_x2, ylookup2, ds_source,
; ds_colormap, ds_xfrac, ds_yfrac, ds_xstep, ds_ystep.
;
; Every texel is written twice (two adjacent bytes), starting at byte
; offset ds_x1*2 in the row.  Texel index is
;     ((yfrac>>16)&63)<<6 | ((xfrac>>16)&63).
; A leading doubled pixel is drawn separately when the destination is
; not longword aligned, then groups of four texels (8 bytes) are written
; as two longwords, then the remaining texels one at a time.
;
; a4 is reused as an end pointer once all globals have been read; it is
; saved and restored together with d2-d7/a2-a3.
;-----------------------------------------------------------------------
_R_DrawSpanLow
@R_DrawSpanLow
		movem.l d2-d7/a2-a4,-(sp)
		move.l  _ds_y(a4),d0
		move.l  _ds_x1(a4),d1	; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		movea.l	_ylookup2(a4),a0 ;     = ylookup2[_ds_y] + (_ds_x1<<1)
		add.l	d1,d1
		move.l  (a0,d0.l*4),a0
		adda.l	d1,a0
		move.l  _ds_x2(a4),d7	; byte count = 2*(_ds_x2 - _ds_x1 + 1)
		move.l  _ds_source(a4),a1
		add.l	d7,d7
		move.l  _ds_colormap(a4),a2
		sub.l   d1,d7
		addq.l	#2,d7
		move.l  _ds_xfrac(a4),d0
		move.l  _ds_yfrac(a4),d1
		move.l  _ds_xstep(a4),d2
		move.l  _ds_ystep(a4),d3
		move.l  a0,d4		; notice, that this address must already be aligned by word
		btst    #1,d4
		beq.b   .skips2
		move.l  d0,d5           ; do the unaligned pixels
		move.l  d1,d6           ; so we can write to longword
		swap    d5              ; boundary in the main loop
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6		; this is the worst possible
		lsl.w   #6,d6		; way but hey, this is not a loop
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		add.l   d2,d0
		move.b  (a2,d5.w),(a0)+
		add.l   d3,d1
		move.b	(a2,d5.w),(a0)+	; same pixel again (low detail doubling)
		subq.l  #2,d7
.skips2		move.l  a2,d4		; d4 = colormap base; low byte gets the texel
		lea     $1000(a1),a1	; catch 22: index word below is $f000|idx,
					; i.e. idx-$1000 when sign-extended
		move.l  a0,a3
		add.l   d7,a3		; a3 = end of span
		move.l  d7,d5
		and.b   #~7,d5
		move.l  a0,a4
		add.l   d5,a4		; a4 = end of the 8-byte groups
		eor.w   d0,d1           ; swap fraction parts for addx
		eor.w   d2,d3
		eor.w   d1,d0
		eor.w   d3,d2
		eor.w   d0,d1
		eor.w   d2,d3
		swap    d0		; d0 = yfrac : x integer
		swap    d1		; d1 = xfrac : y integer
		swap    d2
		swap    d3
		lsl.w   #6,d1		; y integer occupies bits 6..11
		lsl.w   #6,d3
		move.w  #$ffc0,d6	; mask: everything but the x bits
		move.w  #$f03f,d7	; mask: everything but the y bits
		lsr.w   #3,d5
		beq.b   .skip_loop22
		sub.w   d2,d0
		add.l   d2,d0           ; setup the X flag
.loop22		or.w    d6,d0		; Not really an exercise in optimizing
		or.w    d7,d1		; but I guess it's faster than 1x1 for 030
		and.w   d1,d0		; where this low detail business is needed.
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.w  (a2),d5
		or.w    d6,d0
		move.b	(a2),d5
		or.w    d7,d1
		and.w   d1,d0
		swap	d5
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.w  (a2),d5
		or.w    d6,d0
		move.b	(a2),d5
		or.w    d7,d1
		and.w   d1,d0
		move.l	d5,(a0)+
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.w  (a2),d5
		or.w    d6,d0
		move.b	(a2),d5
		or.w    d7,d1
		and.w   d1,d0
		swap	d5
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.w  (a2),d5
		move.b	(a2),d5
		move.l  d5,(a0)+
		cmp.l   a0,a4
		bne.b   .loop22
		; Leaving the main loop, d0 and the X flag are already primed
		; for the next texel (cmp/bne do not touch X).  Priming again
		; would advance the y fraction one extra step and could put
		; the tail texels one row off, so skip it.
		bra.b	.loop_end22
.skip_loop22				; main loop was not entered:
		sub.w   d2,d0
		add.l   d2,d0		; setup the X flag for the tail loop

		bra.b   .loop_end22
.loop32  	or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.b  (a2),(a0)+
		move.b	(a2),(a0)+
.loop_end22
		cmp.l   a0,a3
		bne.b   .loop32
.end22		movem.l (sp)+,d2-d7/a2-a4
		rts

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawTranslatedColumnLow - draw one textured column through a
; translation table and then a colormap, low detail.
;
; Reads (a4-relative globals): dc_yl, dc_yh, dc_x, ylookup2,
; dc_translation, dc_colormap, dc_source, dc_iscale, centery,
; dc_texturemid.
;
; Same structure as R_DrawColumnLow: each pixel is stored as a word with
; the mapped byte in both halves, four rows per unrolled group, entry
; point chosen from (count & 3).  The swNN_3 / swmN0_3 fields are
; rewritten by init_r_draw.
;
; Preserves d2-d4/d6-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawTranslatedColumnLow
@R_DrawTranslatedColumnLow
		movem.l d2-d4/d6-d7/a2/a3,-(sp)

		move.l  _dc_yh(a4),d7	; count = _dc_yh - _dc_yl
		move.l  _dc_yl(a4),d0
		sub.l   d0,d7
		bmi.w   .end3

		move.l  _dc_x(a4),d1	; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		movea.l	_ylookup2(a4),a0 ;     = ylookup2[_dc_yl] + (_dc_x<<1)
		add.l	d1,d1
		move.l  (a0,d0.l*4),a0
;;		movea.l	_columnofs(a4),a1
;;		add.l   (a1,d1.l*4),a0
		adda.l	d1,a0			; new

		move.l	_dc_translation(a4),d2
		move.l  _dc_colormap(a4),d4
		move.l  _dc_source(a4),a1

		move.l  _dc_iscale(a4),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   _centery(a4),d0
		muls.l  d1,d0
		add.l   _dc_texturemid(a4),d0

		moveq   #$7f,d3
sw42_3		lea     (SCREENWIDTH*4).w,a3

; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac  after swap: fraction in the high word, integer in the low word
; d1: dfrac after swap: same layout as d0
; d3: $7f
; d4: light table aligned to 256 byte boundary
; d2: translation table aligned to 256 byte boundary
; a3: SCREENWIDTH*4

		move.l  d7,d6
		and.w   #3,d6

		swap    d0              ; swap decimals and fraction
		swap    d1

		add.w   .width_tab3(pc,d6.w*2),a0
		lsr.w   #2,d7
		move.w  .tmap_tab3(pc,d6.w*2),d6

		and.w   d3,d0
		sub.w   d1,d0
		add.l   d1,d0           ; setup the X flag

		jmp 	.loop3(pc,d6.w)

		cnop    0,4
.width_tab3
swm30_3		dc.w    -3*SCREENWIDTH
swm20_3		dc.w    -2*SCREENWIDTH
swm10_3		dc.w    -1*SCREENWIDTH
		dc.w    0
.tmap_tab3
		dc.w    .03-.loop3
		dc.w    .13-.loop3
		dc.w    .23-.loop3
		dc.w    .33-.loop3
.loop3
.33
		move.b  (a1,d0.w),d2	; texel -> low byte of translation address
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4		; translated -> low byte of colormap address
		move.l  d4,a2
		and.w   d3,d0
		move.w	(a2),d6
		move.b  (a2),d6		; d6.w = mapped pixel in both bytes
		move.w	d6,(a0)
.23
		move.b  (a1,d0.w),d2
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4
		move.l  d4,a2
		and.w   d3,d0
		move.w	(a2),d6
		move.b  (a2),d6
sw12_3		move.w	d6,SCREENWIDTH(a0)
.13
		move.b  (a1,d0.w),d2
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4
		move.l  d4,a2
		and.w   d3,d0
		move.w	(a2),d6
		move.b	(a2),d6
sw22_3		move.w  d6,SCREENWIDTH*2(a0)
.03
		move.b  (a1,d0.w),d2
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4
		move.l  d4,a2
		and.w   d3,d0
		move.w	(a2),d6
		move.b	(a2),d6
		; Word store like the three rows above: a byte store here left
		; the right half of every fourth doubled pixel unwritten.  The
		; displacement patched by init_r_draw is still at sw32_3+2.
sw32_3		move.w  d6,SCREENWIDTH*3(a0)

		add.l   a3,a0
.loop_end3
		dbf 	d7,.loop3
.end3
		movem.l (sp)+,d2-d4/d6-d7/a2/a3
		rts

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawFuzzColumnLow - draw a "fuzz" column, low detail.
;
; Reads: viewheight, dc_yl, dc_yh, dc_x, ylookup2, colormaps, fuzzoffset,
; fuzzpos, SCREENHEIGHT.  Writes: fuzzpos.
;
; No texture is sampled.  For every row the routine reads the screen
; byte at (row address + next fuzzoffset entry), maps it through the
; 256-byte table at colormaps + 6*256 and stores the result as a doubled
; pixel (word).  dc_yl == 0 is raised to 1 and dc_yh == viewheight-1 is
; lowered to viewheight-2 before drawing.
;
; fuzzoffset is loaded as a pointer to longword entries; fuzzpos is
; handled here as a byte offset into that table.
;
; Preserves d4/d6-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawFuzzColumnLow
@R_DrawFuzzColumnLow
		movem.l d4/d6-d7/a2/a3,-(sp)

		move.l	_viewheight(a4),d1
		subq.l	#1,d1
		move.l  _dc_yh(a4),d7	; count = _dc_yh - _dc_yl
		cmp.l	d1,d7
		bne.b	.skip_yh4
		subq.l	#1,d1		; dc_yh == viewheight-1: use viewheight-2
		move.l	d1,d7
.skip_yh4
		move.l  _dc_yl(a4),d0
		bne.b	.skip_yl4
		moveq	#1,d0		; dc_yl == 0: start at row 1
.skip_yl4
		sub.l   d0,d7
		bmi.w   .end4

		move.l  _dc_x(a4),d1	; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		movea.l	_ylookup2(a4),a0 ;     = ylookup2[_dc_yl] + (_dc_x<<1)
		add.l	d1,d1
		move.l  (a0,d0.l*4),a0
		adda.l	d1,a0

		move.l  _colormaps(a4),d4
		add.l	#6*256,d4	; d4 = address of colormaps + 6*256

		movea.l	_fuzzoffset(a4),a1
		move.l	_fuzzpos(a4),d0	; bring it down 
.pos_loop4	sub.l	_SCREENHEIGHT(a4),d0	; subtract until negative ...
		bpl	.pos_loop4
		add.l	_SCREENHEIGHT(a4),d0	; ... then add back once
		add.l	d0,a1		; a1 = fuzzoffset + reduced fuzzpos (bytes)
					; NOTE(review): assumes the table is long
					; enough for this start plus one column

sw42_4		lea     (SCREENWIDTH*4).w,a3

; d7: cnt >> 2
; a0: chunky
; a1: fuzzoffset (walking pointer, one longword per row)
; d4: light table aligned to 256 byte boundary (low byte replaced per row)
; d6: entry offset, then the doubled pixel
; a3: SCREENWIDTH*4

		move.l  d7,d6
		and.w   #3,d6

		add.w   .width_tab4(pc,d6.w*2),a0
		lsr.w   #2,d7
		move.w  .tmap_tab4(pc,d6.w*2),d6

		jmp 	.loop4(pc,d6.w)

		cnop    0,4
.width_tab4
swm30_4		dc.w    -3*SCREENWIDTH
swm20_4		dc.w    -2*SCREENWIDTH
swm10_4		dc.w    -1*SCREENWIDTH
		dc.w    0
.tmap_tab4
		dc.w    .04-.loop4
		dc.w    .14-.loop4
		dc.w    .24-.loop4
		dc.w    .34-.loop4
.loop4
.34		move.l	a0,a2			; This is essentially
		add.l	(a1)+,a2		; just moving memory around.
		move.b	(a2),d4			; screen byte -> low byte of table address
		move.l	d4,a2			
		move.w	(a2),d6
		move.b	(a2),d6			; d6.w = mapped pixel in both bytes
		move.w	d6,(a0)		
.24
sw12_4		lea	SCREENWIDTH(a0),a2	
		add.l	(a1)+,a2		
		move.b	(a2),d4			
		move.l	d4,a2
		move.w	(a2),d6
		move.b	(a2),d6
sw12_4a		move.w	d6,SCREENWIDTH(a0)
.14
sw22_4		lea	2*SCREENWIDTH(a0),a2
		add.l	(a1)+,a2
		move.b	(a2),d4
		move.l	d4,a2
		move.w	(a2),d6
		move.b	(a2),d6
sw22_4a		move.w	d6,2*SCREENWIDTH(a0)
.04
sw32_4		lea	3*SCREENWIDTH(a0),a2
		add.l	(a1)+,a2
		move.b	(a2),d4
		move.l	d4,a2
		move.w	(a2),d6
		move.b	(a2),d6
sw32_4a		move.w	d6,3*SCREENWIDTH(a0)

		add.l   a3,a0
.loop_end4
		dbf	d7,.loop4
		sub.l	_fuzzoffset(a4),a1	; back to a byte offset into the table
		move.l	a1,_fuzzpos		; NOTE(review): absolute store, while the
						; load above is a4-relative - confirm both
						; forms address the same variable
.end4
		movem.l (sp)+,d4/d6-d7/a2/a3
		rts

;-----------------------------------------------------------------------
; high detail versions

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawFuzzColumn - draw a "fuzz" column, high detail (1 byte/pixel).
;
; Reads: viewheight, dc_yl, dc_yh, dc_x, ylookup2, colormaps, fuzzoffset,
; fuzzpos, SCREENHEIGHT.  Writes: fuzzpos.
;
; For every row the routine reads the screen byte at (row address + next
; fuzzoffset entry), maps it through the 256-byte table at
; colormaps + 6*256 and stores one byte.  dc_yl == 0 is raised to 1 and
; dc_yh == viewheight-1 is lowered to viewheight-2 before drawing.
;
; fuzzoffset is loaded as a pointer to longword entries; fuzzpos is
; handled here as a byte offset into that table.
;
; Preserves d4/d6-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawFuzzColumn
@R_DrawFuzzColumn
		movem.l d4/d6-d7/a2/a3,-(sp)

		move.l	_viewheight(a4),d1
		subq.l	#1,d1
		move.l  _dc_yh(a4),d7	; count = _dc_yh - _dc_yl
		cmp.l	d1,d7
		bne.b	.skip_yh5
		subq.l	#1,d1		; dc_yh == viewheight-1: use viewheight-2
		move.l	d1,d7
.skip_yh5
		move.l  _dc_yl(a4),d0
		bne.b	.skip_yl5
		moveq	#1,d0		; dc_yl == 0: start at row 1
.skip_yl5
		sub.l   d0,d7
		bmi.w   .end5

		movea.l	_ylookup2(a4),a0 ; dest = ylookup2[_dc_yl] + dc_x
		move.l  (a0,d0.l*4),a0
		adda.l	_dc_x(a4),a0

		move.l  _colormaps(a4),d4
		add.l	#6*256,d4	; d4 = address of colormaps + 6*256

		movea.l	_fuzzoffset(a4),a1
		move.l	_fuzzpos(a4),d0
.pos_loop5	sub.l	_SCREENHEIGHT(a4),d0	; subtract until negative ...
		bpl	.pos_loop5
		add.l	_SCREENHEIGHT(a4),d0	; ... then add back once
		add.l	d0,a1		; a1 = fuzzoffset + reduced fuzzpos (bytes)
					; NOTE(review): assumes the table is long
					; enough for this start plus one column

sw42_5		lea     (SCREENWIDTH*4).w,a3

; d7: cnt >> 2
; a0: chunky
; a1: fuzzoffset (walking pointer, one longword per row)
; d4: light table aligned to 256 byte boundary (low byte replaced per row)
; d6: entry offset into the unrolled body
; a3: SCREENWIDTH*4

		move.l  d7,d6
		and.w   #3,d6

		add.w   .width_tab5(pc,d6.w*2),a0
		lsr.w   #2,d7
		move.w  .tmap_tab5(pc,d6.w*2),d6

		jmp 	.loop5(pc,d6.w)

		cnop    0,4
.width_tab5
swm30_5		dc.w    -3*SCREENWIDTH
swm20_5		dc.w    -2*SCREENWIDTH
swm10_5		dc.w    -1*SCREENWIDTH
		dc.w    0
.tmap_tab5
		dc.w    .05-.loop5
		dc.w    .15-.loop5
		dc.w    .25-.loop5
		dc.w    .35-.loop5
.loop5
.35		move.l	a0,a2			; This is essentially
		add.l	(a1)+,a2		; just moving memory around.
		move.b	(a2),d4
		move.l	d4,a2			; Not 060 optimized but
		move.b	(a2),(a0)		; if you have hordes of
.25
sw12_5		lea	SCREENWIDTH(a0),a2	; invisible monsters which
		add.l	(a1)+,a2		; slow down the game too much,
		move.b	(a2),d4			; do tell me.
		move.l	d4,a2
sw12_5a		move.b	(a2),SCREENWIDTH(a0)
.15
sw22_5		lea	2*SCREENWIDTH(a0),a2
		add.l	(a1)+,a2
		move.b	(a2),d4
		move.l	d4,a2
sw22_5a		move.b	(a2),2*SCREENWIDTH(a0)
.05
sw32_5		lea	3*SCREENWIDTH(a0),a2
		add.l	(a1)+,a2
		move.b	(a2),d4
		move.l	d4,a2
sw32_5a		move.b	(a2),3*SCREENWIDTH(a0)

		add.l   a3,a0
.loop_end5
		dbf	d7,.loop5
		sub.l	_fuzzoffset(a4),a1	; back to a byte offset into the table
		move.l	a1,_fuzzpos		; NOTE(review): absolute store, while the
						; load above is a4-relative - confirm both
						; forms address the same variable
.end5
		movem.l (sp)+,d4/d6-d7/a2/a3
		rts

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawTranslatedColumn - draw one textured column through a
; translation table and then a colormap, high detail (1 byte/pixel).
;
; Reads (a4-relative globals): dc_yl, dc_yh, dc_x, ylookup2,
; dc_translation, dc_colormap, dc_source, dc_iscale, centery,
; dc_texturemid.
;
; Four rows per unrolled group; the entry point is chosen from
; (count & 3).  The swNN_6 / swmN0_6 fields are rewritten by
; init_r_draw.
;
; Preserves d2-d4/d6-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawTranslatedColumn					; no 060 version :(
@R_DrawTranslatedColumn
		movem.l d2-d4/d6-d7/a2/a3,-(sp)

		move.l  _dc_yh(a4),d7	; count = _dc_yh - _dc_yl
		move.l  _dc_yl(a4),d0
		sub.l   d0,d7
		bmi.w   .end6

;;		move.l  _dc_x(a4),d1	; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		movea.l	_ylookup2(a4),a0 ;     = ylookup2[_dc_yl] + _dc_x
		move.l  (a0,d0.l*4),a0
;;		movea.l	_columnofs(a4),a1
;;		add.l   (a1,d1.l*4),a0
		adda.l	_dc_x(a4),a0	; new

		move.l	_dc_translation(a4),d2
		move.l  _dc_colormap(a4),d4
		move.l  _dc_source(a4),a1

		move.l  _dc_iscale(a4),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   _centery(a4),d0
		muls.l  d1,d0
		add.l   _dc_texturemid(a4),d0

		moveq   #$7f,d3
sw42_6		lea     (SCREENWIDTH*4).w,a3

; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac  after swap: fraction in the high word, integer in the low word
; d1: dfrac after swap: same layout as d0
; d3: $7f
; d4: light table aligned to 256 byte boundary
; d2: translation table aligned to 256 byte boundary
; a3: SCREENWIDTH*4

		move.l  d7,d6
		and.w   #3,d6

		swap    d0              ; swap decimals and fraction
		swap    d1

		add.w   .width_tab6(pc,d6.w*2),a0
		lsr.w   #2,d7
		move.w  .tmap_tab6(pc,d6.w*2),d6

		and.w   d3,d0
		sub.w   d1,d0
		add.l   d1,d0           ; setup the X flag

		jmp 	.loop6(pc,d6.w)

		cnop    0,4
.width_tab6
swm30_6		dc.w    -3*SCREENWIDTH
swm20_6		dc.w    -2*SCREENWIDTH
swm10_6		dc.w    -1*SCREENWIDTH
		dc.w    0
.tmap_tab6
		dc.w    .06-.loop6
		dc.w    .16-.loop6
		dc.w    .26-.loop6
		dc.w    .36-.loop6
.loop6
.36
		move.b  (a1,d0.w),d2	; texel -> low byte of translation address
		move.l	d2,a2
		addx.l  d1,d0		; step frac; X carries fraction overflow
		move.b	(a2),d4		; translated -> low byte of colormap address
		and.w   d3,d0
		move.l  d4,a2
		move.b  (a2),(a0)
.26
		move.b  (a1,d0.w),d2
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4
		and.w   d3,d0
		move.l  d4,a2
sw12_6		move.b  (a2),SCREENWIDTH(a0)
.16
		move.b  (a1,d0.w),d2
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4
		and.w   d3,d0
		move.l  d4,a2
sw22_6		move.b  (a2),SCREENWIDTH*2(a0)
.06
		move.b  (a1,d0.w),d2
		move.l	d2,a2
		addx.l  d1,d0
		move.b	(a2),d4
		and.w   d3,d0
		move.l  d4,a2
sw32_6		move.b  (a2),SCREENWIDTH*3(a0)

		add.l   a3,a0
.loop_end6
		dbf 	d7,.loop6
.end6
		movem.l (sp)+,d2-d4/d6-d7/a2/a3
		rts

;-----------------------------------------------------------------------
		cnop	0,4

; routine from j.selck@flensburg.netsurf.de   (Aki's 040 routine is faster)

;_R_DrawColumn_030
;@R_DrawColumn_030
;		movem.l	d3-d7/a2-a5,-(sp)
;		move.l	_dc_yl(a4),d0
;		move.l	_dc_yh(a4),d7
;		sub.l	d0,d7
;		bmi.b	1$
;		move.l	_dc_x(a4),d1
;		movea.l	_columnofs(a4),a5
;		lea	(a5,d1.l*4),a1
;		movea.l	_ylookup(a4),a5
;		movea.l	(a5,d0.l*4),a2
;		adda.l	(a1),a2
;		move.l	_dc_iscale(a4),d6
;		sub.l	_centery(a4),d0
;		muls.l	d6,d0
;		move.l	_dc_texturemid(a4),d5
;		add.l	d0,d5
;		movea.l	_dc_source(a4),a3
;		movea.l	_dc_colormap(a4),a4
;		moveq	#127,d4
;		move.l	_SCREENWIDTH(a4),d3
;		moveq	#0,d1		; ensure high bits of d1 are clear
;		add.w	d6,d5		; frac += fracstep (also sets X flag)
;		swap	d5		; swap(frac)
;		swap	d6		; swap(fracstep)
;		and.w	d4,d5		; (frac>>16)&127
;2$		move.b	(a3,d5.w),d1	; dc_source[(frac>>FRACBITS)&127]
;		move.b	(a4,d1.w),(a2)	; *dest = dc_colormap[d1]
;		addx.l	d6,d5		; swap(frac += fracstep), use & set X
;		adda.l	d3,a2		; dest += SCREENWIDTH
;		and.w	d4,d5		; (frac>>16)&127
;		dbra	d7,2$
;1$		movem.l	(sp)+,d3-d7/a2-a5
;		rts

;-----------------------------------------------------------------------
		cnop	0,4

;-----------------------------------------------------------------------
; R_DrawColumn_060 - draw one textured column, high detail, scheduled
; for the 68060.
;
; Reads (absolute addressing): dc_yl, dc_yh, dc_x, ylookup2,
; dc_colormap, dc_source, dc_iscale, centery, dc_texturemid,
; SCREENWIDTH.
;
; The first (count & 3) + 1 rows are drawn by a simple loop; the
; remaining rows are drawn four at a time by a loop that keeps two
; texture positions (even rows in d0, odd rows in d2), each stepped by
; 2*dfrac.  The sw12_7/sw22_7/sw32_7 displacements are rewritten by
; init_r_draw.
;
; Preserves d2-d3/d5-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawColumn_060
@R_DrawColumn_060
		movem.l d2-d3/d5-d7/a2/a3,-(sp)

		move.l  (_dc_yh),d7     ; count = _dc_yh - _dc_yl
		move.l  (_dc_yl),d0
		sub.l   d0,d7
		bmi.w   .end7

;;		move.l  (_dc_x),d1      ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		movea.l	(_ylookup2),a0	;      = ylookup2[_dc_yl] + _dc_x
		move.l  (a0,d0.l*4),a0
;;		movea.l	(_columnofs),a1
;;		add.l   (a1,d1.l*4),a0
		adda.l	(_dc_x),a0	; new

		move.l  (_dc_colormap),a2
		move.l  (_dc_source),a1

		move.l  (_dc_iscale),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   (_centery),d0
		muls.l  d1,d0
		add.l   (_dc_texturemid),d0

		moveq   #$7f,d3
		move.l  (_SCREENWIDTH),a3

		move.l  d7,d6           ; Do the leftover iterations in
		and.w   #3,d6           ; this loop.
		addq.w	#1,d6		; (count & 3) + 1 rows
.skip_loop7
		move.l  d0,d5
		swap    d5
		and.l   d3,d5		; d5 = (frac >> 16) & 127, upper bits clear
		move.b  (a1,d5.w),d5
		add.l   d1,d0
		move.b  (a2,d5.w),(a0)
		add.l   a3,a0
		subq.w  #1,d6
		bne.b   .skip_loop7
; Register use in the main loop:
; d7: (cnt >> 2) - 1 (dbf counter)
; a0: chunky
; a1: texture
; a2: light_table
; d0: even-row position after swap (integer part in the low word)
; d1: dfrac*2 after swap (same layout)
; d2: odd-row position (frac+dfrac) after swap
;     the fraction words of d0 and d2 are exchanged so that each addx
;     produces the carry needed by the other accumulator's next addx
; d3: $7f
; d5/d6: texel, then colormapped pixel (upper 24 bits must stay zero
;     because they are used as .l indices)
; a3: SCREENWIDTH*4
.skip7
		lsr.l   #2,d7
		subq.l	#1,d7
		bmi.b	.end7		; no complete group of four rows left

		add.l   a3,a3

		move.l  d0,d2
		add.l   a3,a3		; a3 = 4 * SCREENWIDTH
		add.l   d1,d2		; d2 = frac + dfrac
		add.l   d1,d1		; d1 = 2 * dfrac

		eor.w   d0,d2           ; swap the fraction part for addx
		eor.w   d2,d0           ; assuming 16.16 fixed point
		eor.w   d0,d2

		swap    d0              ; swap decimals and fraction
		swap    d1
		swap    d2

		moveq   #0,d5
		and.w   d3,d2
		and.w   d3,d0

		sub.w   d1,d0
		add.l   d1,d0           ; setup the X flag

		move.b  (a1,d2.w),d5	; preload texel for row 1
.loop7
		; This should be reasonably scheduled for
		; m68060. It should perform well on other processors
		; too. That AGU stall still bothers me though.

		move.b  (a1,d0.w),d6        ; stall + pOEP but allows sOEP
		addx.l  d1,d2               ; pOEP only
		move.b  (a2,d5.l),d5        ; pOEP but allows sOEP
		and.w   d3,d2               ; sOEP
		move.b  (a2,d6.l),d6        ; pOEP but allows sOEP
sw12_7		move.b  d5,SCREENWIDTH(a0)  ; sOEP
		addx.l  d1,d0               ; pOEP only
		move.b  (a1,d2.w),d5        ; pOEP but allows sOEP
		and.w   d3,d0               ; sOEP
		move.b  d6,(a0)             ; pOEP
						; = ~4 cycles/pixel
						; + cache misses

		; The vertical writes are the true timehog of the loop
		; because of the characteristics of the copyback cache
		; operation.
		
		; Better mark the chunky buffer as write through
		; with the MMU and have all the horizontal writes
		; be longs aligned to longword boundary.

		move.b  (a1,d0.w),d6
		addx.l  d1,d2
		move.b  (a2,d5.l),d5
		and.w   d3,d2
		move.b  (a2,d6.l),d6
sw32_7		move.b  d5,SCREENWIDTH*3(a0)
		addx.l  d1,d0
		move.b  (a1,d2.w),d5	; texel for row 1 of the next group
		and.w   d3,d0
sw22_7		move.b  d6,SCREENWIDTH*2(a0)

		add.l   a3,a0		; adda leaves X untouched
.loop_end7
		dbf     d7,.loop7

		; it's faster to divide it to two lines on 060
		; and shouldn't be slower on 040.

;		move.b  (a1,d0.w),d6    ; new
;		move.b  (a2,d6.l),d6    ; new
;		move.b  d6,(a0)     ; new

.end7
		movem.l (sp)+,d2-d3/d5-d7/a2/a3
		rts

;-----------------------------------------------------------------------
		cnop    0,4

; 040 version

;-----------------------------------------------------------------------
; R_DrawColumn_040 - draw one textured column, high detail (1 byte per
; pixel).
;
; Reads (a4-relative globals): dc_yl, dc_yh, dc_x, ylookup2, dc_colormap,
; dc_source, dc_iscale, centery, dc_texturemid.
;
; Four rows per unrolled group; the entry point is chosen from
; (count & 3) and the destination is biased so the fixed row offsets
; land on the right rows.  The swNN_8 / swmN0_8 fields are rewritten by
; init_r_draw.
;
; Preserves d3-d4/d6-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawColumn_040
@R_DrawColumn_040
		movem.l d3-d4/d6-d7/a2/a3,-(sp)

		move.l  _dc_yh(a4),d7     ; count = _dc_yh - _dc_yl
		move.l  _dc_yl(a4),d0
		sub.l   d0,d7
		bmi.w   .end8

		movea.l	_ylookup2(a4),a0  ; dest = ylookup2[_dc_yl] + _dc_x
		move.l  (a0,d0.l*4),a0
		adda.l	_dc_x(a4),a0

		move.l  _dc_colormap(a4),d4
		move.l  _dc_source(a4),a1

		move.l  _dc_iscale(a4),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l   _centery(a4),d0
		muls.l  d1,d0
		add.l   _dc_texturemid(a4),d0

		moveq   #$7f,d3
sw42_8		lea     (SCREENWIDTH*4).w,a3

; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac  after swap: fraction in the high word, integer in the low word
; d1: dfrac after swap: same layout as d0
; d3: $7f
; d4: light table aligned to 256 byte boundary; the low byte is replaced
;     by the texel so d4 becomes the address of the colormapped pixel
; a3: SCREENWIDTH*4

		move.l  d7,d6
		and.w   #3,d6

		swap    d0              ; swap decimals and fraction
		swap    d1

		adda.w	.width_tab8(pc,d6.w*2),a0
		lsr.w   #2,d7
		move.w  .tmap_tab8(pc,d6.w*2),d6

		and.w   d3,d0
		sub.w   d1,d0
		add.l   d1,d0           ; setup the X flag

		jmp	.loop8(pc,d6.w)

		cnop    0,4
.width_tab8
swm30_8		dc.w    -3*SCREENWIDTH
swm20_8		dc.w    -2*SCREENWIDTH
swm10_8		dc.w    -1*SCREENWIDTH
		dc.w    0
.tmap_tab8
		dc.w    .08-.loop8
		dc.w    .18-.loop8
		dc.w    .28-.loop8
		dc.w    .38-.loop8
.loop8
.38
		move.b  (a1,d0.w),d4	; texel -> low byte of colormap address
		addx.l  d1,d0		; step frac; X carries fraction overflow
		move.l  d4,a2
		and.w   d3,d0		; wrap texture index to 0..127
		move.b  (a2),(a0)
.28
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
sw12_8		move.b  (a2),SCREENWIDTH(a0)
.18
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
sw22_8		move.b  (a2),SCREENWIDTH*2(a0)
.08
		move.b  (a1,d0.w),d4
		addx.l  d1,d0
		move.l  d4,a2
		and.w   d3,d0
sw32_8		move.b  (a2),SCREENWIDTH*3(a0)

		adda.l	a3,a0
.loop_end8
		dbf d7,.loop8
.end8
		movem.l (sp)+,d3-d4/d6-d7/a2/a3
		rts

;-----------------------------------------------------------------------
; This faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>

		cnop    0,4

;-----------------------------------------------------------------------
; R_DrawSpan_060 - draw one horizontal flat span, high detail,
; scheduled for the 68060.
;
; Reads (absolute addressing): ds_y, ds_x1, ds_x2, ylookup2, ds_source,
; ds_colormap, ds_xfrac, ds_yfrac, ds_xstep, ds_ystep.
;
; Texel index is ((yfrac>>16)&63)<<6 | ((xfrac>>16)&63).  Leading pixels
; are drawn one byte / one word at a time until the destination is
; longword aligned, then four pixels per longword store, then the
; remaining 0..3 pixels one byte at a time.
;
; Preserves d2-d7/a2/a3; uses d0/d1/a0/a1 as scratch.
;-----------------------------------------------------------------------
_R_DrawSpan_060
@R_DrawSpan_060
		movem.l d2-d7/a2/a3,-(sp)
		move.l  (_ds_y),d0
		move.l  (_ds_x1),d1     ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		movea.l	(_ylookup2),a0	;      = ylookup2[_ds_y] + _ds_x1
		move.l  (a0,d0.l*4),a0
;;		movea.l	(_columnofs),a1
;;		add.l   (a1,d1.l*4),a0
		adda.l	d1,a0			; new
		move.l  (_ds_source),a1
		move.l  (_ds_colormap),a2
		move.l  (_ds_x2),d7     ; count = _ds_x2 - _ds_x1 + 1
		sub.l   d1,d7
		addq.l  #1,d7
		move.l  (_ds_xfrac),d0
		move.l  (_ds_yfrac),d1
		move.l  (_ds_xstep),d2
		move.l  (_ds_ystep),d3
		move.l  a0,d4
		btst    #0,d4
		beq.b     .skipb9
		move.l  d0,d5           ; do the unaligned pixels
		move.l  d1,d6           ; so we can write to longword
		swap    d5              ; boundary in the main loop
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		add.l   d2,d0
		move.b  (a2,d5.w),(a0)+
		add.l   d3,d1
		move.l  a0,d4
		subq.l  #1,d7
.skipb9		btst    #1,d4
		beq.b     .skips9
		moveq   #2,d4
		cmp.l   d4,d7
		bls.b   .skips9		; 2 or fewer left: the tail loop handles them
		move.l  d0,d5           ; write two pixels
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.w  (a2,d5.w),d4	; first pixel lands in the high byte of d4.w
		add.l   d2,d0
		add.l   d3,d1
		move.l  d0,d5
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.b  (a2,d5.w),d4	; second pixel in the low byte
		add.l   d2,d0
		move.w  d4,(a0)+
		add.l   d3,d1
		subq.l  #2,d7
.skips9		move.l  d7,d6           ; setup registers
		and.w   #3,d6
		move.l  d6,a3		; a3 = number of tail pixels (0..3)
		eor.w   d0,d1           ; swap fraction parts for addx
		eor.w   d2,d3
		eor.w   d1,d0
		eor.w   d3,d2
		eor.w   d0,d1
		eor.w   d2,d3
		swap    d0		; d0 = yfrac : x integer
		swap    d1		; d1 = xfrac : y integer
		swap    d2
		swap    d3
		lsl.w   #6,d1		; y integer occupies bits 6..11
		lsl.w   #6,d3
		moveq   #0,d6
		moveq   #0,d5
		sub.l   #$f000,a1	; index in d6 is always $f000 | texel index
		lsr.l   #2,d7
		beq.w   .skip_loop29
		subq.l  #1,d7
		sub.w   d3,d1
		add.l   d3,d1           ; setup the X flag
		or.w    #$ffc0,d0
		or.w    #$f03f,d1
		move.w  d0,d6
		and.w   d1,d6
		bra.b   .start_loop29
		cnop    0,8
.loop29		or.w    #$ffc0,d0       ; pOEP
		or.w    #$f03f,d1       ; sOEP
		move.b  (a2,d5.l),d4    ; pOEP but allows sOEP
		move.w  d0,d6           ; sOEP
		and.w   d1,d6           ; pOEP
		move.l  d4,(a0)+        ; sOEP
.start_loop29
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP
		or.w    #$ffc0,d0       ; sOEP
		or.w    #$f03f,d1       ; pOEP
		move.w  d0,d6           ; sOEP
		move.w  (a2,d5.l),d4    ; pOEP but allows sOEP
		and.w   d1,d6           ; sOEP
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP
		or.w    #$ffc0,d0       ; sOEP
		or.w    #$f03f,d1       ; pOEP
		move.w  d0,d6           ; sOEP
		move.b  (a2,d5.l),d4    ; pOEP but allows sOEP
		and.w   d1,d6           ; sOEP
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP
		or.w    #$ffc0,d0       ; sOEP
		or.w    #$f03f,d1       ; pOEP
		move.w  d0,d6           ; sOEP
		swap    d4              ; pOEP only
		move.w  (a2,d5.l),d4    ; pOEP but allows sOEP
		and.w   d1,d6           ; sOEP
		addx.l  d2,d0           ; pOEP only
		addx.l  d3,d1           ; pOEP only
		move.b  (a1,d6.l),d5    ; pOEP but allows sOEP
		dbf     d7,.loop29      ; pOEP only = 7.75 cycles/pixel
		move.b  (a2,d5.l),d4
		move.l  d4,(a0)+
		; Leaving the main loop, d1 and the X flag are already primed
		; for the next pixel (move/dbf/bra do not touch X).  Priming
		; again would advance the x fraction one extra step and could
		; put the tail pixels one texel off, so skip it.
		bra.b	.tail29
.skip_loop29				; main loop was not entered:
		sub.w   d3,d1
		add.l   d3,d1		; setup the X flag for the tail loop
.tail29
		move.l  a3,d7
		bra.b     .loop_end29
.loop39  	or.w    #$ffc0,d0
		or.w    #$f03f,d1
		move.w  d0,d6
		and.w   d1,d6
		addx.l  d2,d0
		addx.l  d3,d1
		move.b  (a1,d6.l),d5
		move.b  (a2,d5.l),(a0)+
.loop_end29
		dbf     d7,.loop39
.end29   	movem.l (sp)+,d2-d7/a2/a3
		rts

		cnop    0,4

;-----------------------------------------------------------------------
; 030/040 version

;-----------------------------------------------------------------------
; R_DrawSpan_040 - draw one horizontal flat span, high detail.
;
; Reads (a4-relative globals): ds_y, ds_x1, ds_x2, ylookup2, ds_source,
; ds_colormap, ds_xfrac, ds_yfrac, ds_xstep, ds_ystep.
;
; Texel index is ((yfrac>>16)&63)<<6 | ((xfrac>>16)&63).  Leading pixels
; are drawn one byte / one word at a time until the destination is
; longword aligned, then four pixels per longword store, then the
; remaining pixels one byte at a time.
;
; a4 is reused as an end pointer once all globals have been read; it is
; saved and restored together with d2-d7/a2-a3.
;-----------------------------------------------------------------------
_R_DrawSpan_040
@R_DrawSpan_040
		movem.l d2-d7/a2-a4,-(sp)
		move.l  _ds_y(a4),d0
		move.l  _ds_x1(a4),d1	; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		movea.l	_ylookup2(a4),a0 ;     = ylookup2[_ds_y] + _ds_x1
		move.l  (a0,d0.l*4),a0
;;		movea.l	_columnofs(a4),a1
;;		add.l   (a1,d1.l*4),a0
		adda.l	d1,a0			; new
		move.l  _ds_source(a4),a1
		move.l  _ds_colormap(a4),a2
		move.l  _ds_x2(a4),d7	; count = _ds_x2 - _ds_x1 + 1
		sub.l   d1,d7
		addq.l  #1,d7
		move.l  _ds_xfrac(a4),d0
		move.l  _ds_yfrac(a4),d1
		move.l  _ds_xstep(a4),d2
		move.l  _ds_ystep(a4),d3
		move.l  a0,d4
		btst    #0,d4
		beq.b   .skipb0
		move.l  d0,d5           ; do the unaligned pixels
		move.l  d1,d6           ; so we can write to longword
		swap    d5              ; boundary in the main loop
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		add.l   d2,d0
		move.b  (a2,d5.w),(a0)+
		add.l   d3,d1
		move.l  a0,d4
		subq.l  #1,d7
.skipb0		btst    #1,d4
		beq.b   .skips0
		moveq   #2,d4
		cmp.l   d4,d7
		bls.b   .skips0		; 2 or fewer left: the tail loop handles them
		move.l  d0,d5           ; write two pixels
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.w  (a2,d5.w),d4	; first pixel lands in the high byte of d4.w
		add.l   d2,d0
		add.l   d3,d1
		move.l  d0,d5
		move.l  d1,d6
		swap    d5
		swap    d6
		and.w   #$3f,d5
		and.w   #$3f,d6
		lsl.w   #6,d6
		or.w    d5,d6
		move.b  (a1,d6.w),d5
		move.b  (a2,d5.w),d4	; second pixel in the low byte
		add.l   d2,d0
		move.w  d4,(a0)+
		add.l   d3,d1
		subq.l  #2,d7
.skips0		move.l  a2,d4		; d4 = colormap base; low byte gets the texel
		add.l   #$1000,a1       ; catch 22: index word below is $f000|idx,
					; i.e. idx-$1000 when sign-extended
		move.l  a0,a3
		add.l   d7,a3		; a3 = end of span
		move.l  d7,d5
		and.b   #~3,d5
		move.l  a0,a4
		add.l   d5,a4		; a4 = end of the 4-pixel groups
		eor.w   d0,d1           ; swap fraction parts for addx
		eor.w   d2,d3
		eor.w   d1,d0
		eor.w   d3,d2
		eor.w   d0,d1
		eor.w   d2,d3
		swap    d0		; d0 = yfrac : x integer
		swap    d1		; d1 = xfrac : y integer
		swap    d2
		swap    d3
		lsl.w   #6,d1		; y integer occupies bits 6..11
		lsl.w   #6,d3
		move.w  #$ffc0,d6	; mask: everything but the x bits
		move.w  #$f03f,d7	; mask: everything but the y bits
		lsr.w   #2,d5
		beq.b   .skip_loop20
		sub.w   d2,d0
		add.l   d2,d0           ; setup the X flag
.loop20		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0		; d0.w = $f000 | texel index
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.w  (a2),d5
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.b  (a2),d5
		swap    d5
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.w  (a2),d5
		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.b  (a2),d5
		move.l  d5,(a0)+
		cmp.l   a0,a4
		bne.b   .loop20
		; Leaving the main loop, d0 and the X flag are already primed
		; for the next pixel (cmp/bne do not touch X).  Priming again
		; would advance the y fraction one extra step and could put
		; the tail pixels one texel row off, so skip it.
		bra.b	.loop_end20
.skip_loop20				; main loop was not entered:
		sub.w   d2,d0
		add.l   d2,d0		; setup the X flag for the tail loop

		bra.b   .loop_end20
.loop30		or.w    d6,d0
		or.w    d7,d1
		and.w   d1,d0
		addx.l  d3,d1
		move.b  (a1,d0.w),d4
		addx.l  d2,d0
		move.l  d4,a2
		move.b  (a2),(a0)+
.loop_end20
		cmp.l   a0,a3
		bne.b   .loop30
.end20		movem.l (sp)+,d2-d7/a2-a4
		rts

;-----------------------------------------------------------------------
		xref	_segtextured		; boolean
		xref	_markfloor		; boolean
		xref	_markceiling		; boolean
		xref	_maskedtexture		; boolean
		xref	_maskedtexturecol	; short *
		xref	_toptexture		; short
		xref	_bottomtexture		; short
		xref	_midtexture		; short
		xref	_rw_x			; int
		xref	_rw_stopx		; int
		xref	_rw_centerangle		; angle_t
		xref	_rw_offset		; fixed_t
		xref	_rw_distance		; fixed_t
		xref	_rw_scale		; fixed_t
		xref	_rw_scalestep		; fixed_t
		xref	_rw_midtexturemid	; fixed_t
		xref	_rw_toptexturemid	; fixed_t
		xref	_rw_bottomtexturemid	; fixed_t
		xref	_pixhigh		; fixed_t
		xref	_pixlow			; fixed_t
		xref	_pixhighstep		; fixed_t
		xref	_pixlowstep		; fixed_t
		xref	_topfrac		; fixed_t
		xref	_topstep		; fixed_t
		xref	_bottomfrac		; fixed_t
		xref	_bottomstep		; fixed_t
		xref	_walllights		; lighttable_t **
		xref	_ceilingclip		; short *
		xref	_ceilingplane		; visplane_t *
		xref	_floorclip		; short *
		xref	_floorplane		; visplane_t *
		xref	_xtoviewangle		; angle_t *
		xref	_finetangent		; fixed_t[]
		xref	_FixedMul
		xref	_colfunc
;;;		xref	@R_GetColumn

		cnop	0,4

; R_RenderSegLoop - walk the screen columns rw_x .. rw_stopx-1 of the current
; wall segment, marking ceiling/floor visplane extents, setting up the column
; drawer globals and calling colfunc for mid/top/bottom textures.
;
; Takes no register arguments; every global is read/written relative to a4.
; Saves and restores d2-d7/a2/a3/a5/a6.
;
; Register roles inside the loop:
;   a2 = rw_x      a3 = topfrac      a5 = bottomfrac      a6 = rw_scale
;   d7 = yl        d3 = yh           d6 = mid             d5 = texturecolumn
; d5 is only loaded inside the "if (segtextured)" part.
; The four address registers are written back to their globals on exit.
_R_RenderSegLoop
@R_RenderSegLoop
		movem.l	d2-d7/a2/a3/a5/a6,-(sp)
		movea.l	_rw_x(a4),a2	; a2 = rw_x
		movea.l	_topfrac(a4),a3	; a3 = topfrac
		movea.l	_bottomfrac(a4),a5 ; a5 = bottomfrac
		movea.l	_rw_scale(a4),a6 ; a6 = rw_scale
		bra.w	1$		; for ( ; rw_x < rw_stopx ; rw_x++)

; ---- yl = max((topfrac + 4095) >> 12, ceilingclip[rw_x] + 1) ----
20$		move.l	a2,d0		; d0 = rw_x
		move.l	a3,d7		; d7 = topfrac
;;;		lea	_ceilingclip(a4),a0 ; a0 -> ceilingclip
		movea.l	_ceilingclip(a4),a0 ; a0 -> ceilingclip
		subq.l	#1,d7		; d7 = topfrac - 1
		move.w	(a0,d0.l*2),d3	; d3.w = ceilingclip[rw_x]
		asr.l	#8,d7		; d7 = (topfrac - 1) >> 8
		ext.l	d3		; d3 = ceilingclip[rw_x]
		asr.l	#4,d7		; d7 = (topfrac - 1) >> 12
		addq.l	#1,d3		; d3 = top = ceilingclip[rw_x] + 1
		addq.l	#1,d7		; d7 = yl = (topfrac + (1 << 12) - 1) >> 12
		cmp.l	d3,d7
		bge.b	2$
		move.l	d3,d7		; d7 = yl = ceilingclip[rw_x] + 1

; ---- ceiling visplane: rows top .. min(yl, floorclip[rw_x]) - 1 ----
2$		tst.l	_markceiling(a4) ; if (markceiling) {
		beq.b	3$
;;;		lea	_floorclip(a4),a1 ; a1 -> floorclip
		movea.l	_floorclip(a4),a1 ; a1 -> floorclip
		move.l	d7,d4		; d4 = yl
		move.w	(a1,d0.l*2),d1	; d1.w = floorclip[rw_x]
		subq.l	#1,d4		; d4 = bottom = yl - 1
		ext.l	d1		; d1 = floorclip[rw_x]
		cmp.l	d1,d4		; if (bottom >= floorclip[rw_x])
		blt.b	4$
		move.l	d1,d4
		subq.l	#1,d4		; d4 = bottom = floorclip[rw_x] - 1

4$		cmp.l	d4,d3		; if (top <= bottom)
		bgt.b	3$

;;;		movea.l	_ceilingplane(a4),a1
;;;		adda.l	d0,a1		; a1 -> ceilingplane->0[rw_x]
;;;		move.b	d3,$15(a1)	; ceilingplane->top[rw_x] = top
;;;		move.b	d4,$157(a1)	; ceilingplane->bottom[rw_x] = bottom

; offsets 20 and 24 are the top / bottom pointer fields of a visplane
		movea.l	_ceilingplane(a4),a0
		movea.l	20(a0),a1
		move.w	d3,(a1,d0.l*2)	; ceilingplane->top[rw_x] = top
		movea.l	24(a0),a1
		move.w	d4,(a1,d0.l*2)	; ceilingplane->bottom[rw_x] = bottom

; ---- yh = min(bottomfrac >> 12, floorclip[rw_x] - 1) ----
3$		move.l	a5,d3		; d3 = bottomfrac
;;;		lea	_floorclip(a4),a1	; a1 -> floorclip
		movea.l	_floorclip(a4),a1	; a1 -> floorclip
		asr.l	#8,d3		; d3 = bottomfrac >> 8
		move.w	(a1,d0.l*2),d1	; d1.w = floorclip[rw_x]
		asr.l	#4,d3		; d3 = yh = bottomfrac >> 12
		ext.l	d1		; d1 = floorclip[rw_x]
		cmp.l	d1,d3		; if (yh >= floorclip[rw_x])
		blt.b	5$
		move.l	d1,d3
		subq.l	#1,d3		; d3 = yh = floorclip[rw_x] - 1

; ---- floor visplane: rows max(yh, ceilingclip[rw_x]) + 1 .. floorclip[rw_x] - 1 ----
5$		tst.l	_markfloor(a4)	; if (markfloor)
		beq.b	6$
		move.l	d3,d4		; d4 = yh
;;;		lea	_ceilingclip(a4),a0 ; a0 -> ceilingclip
		movea.l	_ceilingclip(a4),a0 ; a0 -> ceilingclip
		move.w	(a0,d0.l*2),d2	; d2.w = ceilingclip[rw_x]
		addq.l	#1,d4		; d4 = top = yh + 1
		ext.l	d2		; d2 = ceilingclip[rw_x]
		subq.l	#1,d1		; d1 = bottom = floorclip[rw_x] - 1
		cmp.l	d2,d4		; if (top <= ceilingclip[rw_x])
		bgt.b	7$
		move.l	d2,d4
		addq.l	#1,d4		; d4 = top = ceilingclip[rw_x] + 1
7$		cmp.l	d1,d4		; if (top <= bottom)
		bgt.b	6$

;;;		movea.l	_floorplane(a4),a1
;;;		adda.l	d0,a1		; a1 -> floorplane->0[rw_x]
;;;		move.b	d4,$15(a1)	; floorplane->top[rw_x] = top
;;;		move.b	d1,$157(a1)	; floorplane->bottom[rw_x] = bottom

		movea.l	_floorplane(a4),a0
		movea.l	20(a0),a1
		move.w	d4,(a1,d0.l*2)	; floorplane->top[rw_x] = top
		movea.l	24(a0),a1
		move.w	d1,(a1,d0.l*2)	; floorplane->bottom[rw_x] = bottom

; ---- texture column, light table and inverse scale for this x ----
6$		tst.l	_segtextured(a4) ; if (segtextured)
		beq.b	8$
;;;		lea	_xtoviewangle(a4),a0 ; a0 -> xtoviewangle
		movea.l	_xtoviewangle(a4),a0 ; a0 -> xtoviewangle
		move.l	_rw_centerangle(a4),d1
		add.l	(a0,d0.l*4),d1	; d1 = rw_centerangle + xtoviewangle[rw_x]
		swap	d1		; d1.w = angle >> 16
		lea	(_finetangent),a0 ; a0 -> finetangent
		lsr.w	#3,d1		; d1.w = angle >> 19 (table index)
		move.l	(a0,d1.w*4),d0	; d0 = finetangent[angle]
		movea.l	_FixedMul(a4),a0
		move.l	_rw_distance(a4),d1
		jsr	(a0)		; d0 = FixedMul(finetangent[angle],rw_distance)
		move.l	_rw_offset(a4),d5
		move.l	a6,d4		; d4 = rw_scale
		sub.l	d0,d5		; d5 = rw_offset-FixedMul(finetangent[angle],rw_distance)
		asr.l	#8,d4
		swap	d5		; d5.w = texturecolumn >>= 16
		asr.l	#4,d4		; d4 = index = rw_scale >> 12
		ext.l	d5		; d5 = texturecolumn
		moveq	#$30,d2		; d2 = MAXLIGHTSCALE = $30
		cmp.l	d2,d4		; if (index >= MAXLIGHTSCALE), unsigned compare
		bcs.b	9$
		moveq	#$2f,d4		; d4 = index = MAXLIGHTSCALE - 1
9$		movea.l	_walllights(a4),a0
		moveq	#-1,d0		; d0 = $ffffffff
		move.l	(a0,d4.l*4),_dc_colormap(a4) ; dc_colormap = walllights[index]
		move.l	a2,_dc_x(a4)	; dc_x = rw_x
		move.l	a6,d1		; d1 = rw_scale
		divu.l	d1,d0		; unsigned 32-bit divide
		move.l	d0,_dc_iscale(a4) ; dc_iscale = $ffffffff / rw_scale

; ---- single sided line: one mid texture column, then close the column ----
8$		move.l	_midtexture(a4),d0 ; if (midtexture)
		beq.b	10$
		move.l	d7,_dc_yl(a4)	; dc_yl = yl
		move.l	d3,_dc_yh(a4)	; dc_yh = yh
		move.l	_rw_midtexturemid(a4),_dc_texturemid(a4)
		move.l	d5,d1		; d1 = texturecolumn
		jsr	(@R_GetColumn)
		move.l	d0,_dc_source(a4) ; dc_source = R_GetColumn(midtexture,texturecolumn)
		movea.l	_colfunc(a4),a0
		jsr	(a0)		; colfunc()
		move.l	a2,d0		; d0 = rw_x
		move.l	_viewheight(a4),d1 ; d1 = viewheight
;;;		lea	_ceilingclip(a4),a0
		movea.l	_ceilingclip(a4),a0
		move.w	d1,(a0,d0.l*2)	; ceilingclip[rw_x] = viewheight
;;;		lea	_floorclip(a4),a0
		movea.l	_floorclip(a4),a0
		move.w	#$ffff,(a0,d0.l*2) ; floorclip[rw_x] = -1
		bra.w	11$

; ---- two sided line, upper part ----
10$		move.l	_toptexture(a4),d0 ; if (toptexture)
		beq.b	12$
		move.l	_pixhighstep(a4),d1 ; d1 = pixhighstep
		move.l	_pixhigh(a4),d2	; d2 = pixhigh
		add.l	d1,_pixhigh(a4)	; pixhigh += pixhighstep
;;;		lea	_floorclip(a4),a0 ; a0 -> floorclip
		movea.l	_floorclip(a4),a0 ; a0 -> floorclip
		asr.l	#8,d2		; d2 = pixhigh >> 8
		move.l	a2,d1		; d1 = rw_x
		move.w	(a0,d1.l*2),d1	; d1.w = floorclip[rw_x]
		asr.l	#4,d2		; d2 = pixhigh >> 12
		ext.l	d1		; d1 = floorclip[rw_x]
		move.l	d2,d6		; d6 = mid = pixhigh >> 12
		cmp.l	d1,d6		; if (mid >= floorclip[rw_x])
		blt.b	13$
		move.l	d1,d6
		subq.l	#1,d6		; d6 = mid = floorclip[rw_x] - 1
13$		cmp.l	d7,d6		; if (mid >= yl)
		blt.b	14$
		move.l	d7,_dc_yl(a4)	; dc_yl = yl
		move.l	d6,_dc_yh(a4)	; dc_yh = mid
		move.l	_rw_toptexturemid(a4),_dc_texturemid(a4)
		move.l	d5,d1		; d1 = texturecolumn, d0 = toptexture
		jsr	(@R_GetColumn)
		move.l	d0,_dc_source(a4)	; dc_source = R_GetColumn(d0,d1)
		movea.l	_colfunc(a4),a0
		jsr	(a0)		; colfunc()
		move.l	a2,d0		; d0 = rw_x
;;;		lea	_ceilingclip(a4),a0
		movea.l	_ceilingclip(a4),a0
		move.w	d6,(a0,d0.l*2)	; ceilingclip[rw_x] = mid
		bra.b	15$

; no top wall drawn (no texture, or mid < yl which enters at 14$)
12$		tst.l	_markceiling(a4) ; else if (markceiling)
		beq.b	15$
14$		subq.l	#1,d7		; d7 = yl - 1
		move.l	a2,d0		; d0 = rw_x
;;;		lea	_ceilingclip(a4),a0
		movea.l	_ceilingclip(a4),a0
		move.w	d7,(a0,d0.l*2)	; ceilingclip[rw_x] = yl - 1

; ---- two sided line, lower part ----
15$		move.l	_bottomtexture(a4),d0 ; if (bottomtexture)
		beq.b	16$
		move.l	_pixlow(a4),d6	; d6 = pixlow
		move.l	d6,d1		; d1 = pixlow
;;;		lea	_ceilingclip(a4),a0 ; a0 -> ceilingclip
		movea.l	_ceilingclip(a4),a0 ; a0 -> ceilingclip
		add.l	_pixlowstep(a4),d1 ; d1 = pixlow + pixlowstep
		subq.l	#1,d6		; d6 = pixlow - 1
		move.l	d1,_pixlow(a4)	; pixlow += pixlowstep
		asr.l	#8,d6		; d6 = (pixlow - 1) >> 8
		move.l	a2,d1		; d1 = rw_x
		asr.l	#4,d6		; d6 = (pixlow - 1) >> 12
		move.w	(a0,d1.l*2),d1	; d1.w = ceilingclip[rw_x]
		addq.l	#1,d6		; d6 = mid = (pixlow + (1 << 12) - 1) >> 12
		ext.l	d1		; d1 = ceilingclip[rw_x]
		cmp.l	d1,d6		; if (mid <= ceilingclip[rw_x])
		bgt.b	17$
		move.l	d1,d6
		addq.l	#1,d6		; d6 = mid = ceilingclip[rw_x] + 1
17$		cmp.l	d3,d6		; if (mid <= yh)
		bgt.b	18$
		move.l	d6,_dc_yl(a4)	; dc_yl = mid
		move.l	d3,_dc_yh(a4)	; dc_yh = yh
		move.l	_rw_bottomtexturemid(a4),_dc_texturemid(a4)
		move.l	d5,d1		; d1 = texturecolumn, d0 = bottomtexture
		jsr	(@R_GetColumn)
		move.l	d0,_dc_source(a4) ; dc_source = R_GetColumn(d0,d1)
		movea.l	_colfunc(a4),a0
		jsr	(a0)		; colfunc ()
		move.l	a2,d0		; d0 = rw_x
;;;		lea	_floorclip(a4),a0
		movea.l	_floorclip(a4),a0
		move.w	d6,(a0,d0.l*2)	; floorclip[rw_x] = mid
		bra.b	19$

; no bottom wall drawn (no texture, or mid > yh which enters at 18$)
16$		tst.l	_markfloor(a4)	; else if (markfloor)
		beq.b	19$
18$		addq.l	#1,d3		; d3 = yh + 1
		move.l	a2,d0		; d0 = rw_x
;;;		lea	_floorclip(a4),a0
		movea.l	_floorclip(a4),a0
		move.w	d3,(a0,d0.l*2)	; floorclip[rw_x] = yh + 1

; remember the texture column so the masked mid texture can be drawn later
19$		tst.l	_maskedtexture(a4) ; if (maskedtexture)
		beq.b	11$
		move.l	a2,d0		; d0 = rw_x
		movea.l	_maskedtexturecol(a4),a0
		move.w	d5,(a0,d0.l*2)	; maskedtexturecol[rw_x] = texturecolumn

; ---- step to the next column ----
11$		adda.l	_rw_scalestep(a4),a6 ; rw_scale += rw_scalestep
		adda.l	_topstep(a4),a3	; topfrac += topstep
		adda.l	_bottomstep(a4),a5 ; bottomfrac += bottomstep
		addq.l	#1,a2		; rw_x++

1$		cmpa.l	_rw_stopx(a4),a2
		blt.w	20$		; signed: continue while rw_x < rw_stopx

; write the register copies back to their globals
		move.l	a2,_rw_x(a4)
		move.l	a3,_topfrac(a4)
		move.l	a5,_bottomfrac(a4)
		move.l	a6,_rw_scale(a4)

		movem.l	(sp)+,d2-d7/a2/a3/a5/a6
		rts

;-----------------------------------------------------------------------
; R_MakeSpans (in r_plane.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

		xref	_spanstart
;;;		xref	@R_MapPlane
;void
;__asm R_MakeSpans
;( register __d2 int x,
;  register __d3 int t1,
;  register __d4 int b1,
;  register __d5 int t2,
;  register __d6 int b2 );

		cnop	0,4
; R_MakeSpans - close the spans that end at column x and open the ones that
; start there (assembly version of the C routine quoted below this function).
;
; In:   d2 = x, d3 = t1, d4 = b1, d5 = t2, d6 = b2   (register arguments)
; Uses: a2 -> spanstart, a3 = running pointer into spanstart,
;       d7 = loop counter, d0/d1 = scratch / arguments for R_MapPlane
; Saves and restores d3-d7/a2/a3; d2 is left untouched.
; x-1 is kept on the stack for the whole routine because R_MapPlane takes its
; third argument (x2) from the stack.
@R_MakeSpans:
_R_MakeSpans:
;First comments are for the non-__asm version of func prototype

;		movem.l	d2-d7/a2/a3,-(sp)
		movem.l	d3-d7/a2/a3,-(sp)

;		move.l	36(sp),d4	;(rest of args come in stack) int b1
;		move.l	d0,d2		;int x
;		move.l	40(sp),d5	;int t2
;		move.l	d1,d3		;int t1
;		move.l	44(sp),d6	;int b2

;;; 		move.l	#_spanstart,a2
		movea.l	_spanstart(a4),a2
		move.l	d2,-(sp)
		subq.l	#1,(sp)	;x-1, for R_MapPlane (third argument, thus in stack)

		;D2=X D3=T1 D4=B1 D5=T2 D6=B2, (sp)=X-1

		;prepare while(t1 < t2 && t1<=b1)
		;Calculate just how many times the loop is done, so no need
		; to compare every time
		move.l	d5,d7
		sub.l	d3,d7
		ble.b	.rmsl1_Done	;t1 >= t2, so loop 1 does not run
		subq.l	#1,d7		;dbf runs count+1 times
		move.l	d4,d0
		sub.l	d3,d0		;d0 = b1 - t1
;If T1>B1 first loop is not done. The same rule also applies to
;second loop (first: t1<=b1 second: b1>=t1) so we skip both loops.
		bmi.b	.rmsl2_Done

		lea	(a2,d3.l*4),a3	;a3 -> spanstart[t1]
		cmp.l	d0,d7	;Do the loop until the smaller delta is zero
		;smaller because of AND operation in while statement
		bmi.b	.rms_Loop1	;t2 - t1 is the smaller count
		move.l	d0,d7	;t1<=b1 effective

.rms_Loop1_2:	;Another loop...
		move.l	(a3)+,d1	;d1 = spanstart[t1]
		move.l	d3,d0		;d0 = t1
		jsr	(@R_MapPlane)	;R_MapPlane(t1, spanstart[t1], x-1)
		addq.l	#1,d3		;t1++
		dbf	d7,.rms_Loop1_2
		bra.b	.rmsl2_Done	;..so we can quickly skip
		;loop #2. The loop was done with t1<=b1 and in the
		;end this is not true. Also loop 2 uses the same comparison
		;it is automatically false thus we can skip it.

		cnop	0,4
.rms_Loop1:
		move.l	(a3)+,d1	;d1 = spanstart[t1]
		move.l	d3,d0		;d0 = t1
		jsr	(@R_MapPlane)	;R_MapPlane(t1, spanstart[t1], x-1)
		addq.l	#1,d3		;t1++
		dbf	d7,.rms_Loop1

.rmsl1_Done:

		;prepare while(b1 > b2 && b1>=t1)
		move.l	d4,d0
		sub.l	d3,d0		;d0 = b1 - t1
		bmi.b	.rmsl2_Done
		move.l	d4,d7
		sub.l	d6,d7		;d7 = b1 - b2
		ble.b	.rmsl2_Done
		subq.l	#1,d7

		lea	4(a2,d4.l*4),a3	;a3 -> spanstart[b1+1], read with predecrement
		cmp.l	d0,d7
		bmi.b	.rms_Loop2	;b1 - b2 is the smaller count
		move.l	d0,d7	;b1>=t1 effective

.rms_Loop2_2:
		move.l	-(a3),d1	;d1 = spanstart[b1]
		move.l	d4,d0		;d0 = b1
		jsr	(@R_MapPlane)	;R_MapPlane(b1, spanstart[b1], x-1)
		subq.l	#1,d4		;b1--
		dbf	d7,.rms_Loop2_2

		;This variant ends with b1 < t1 and is only chosen when b2 < t1,
		;so if t1 <= t2 then b2 < t2 and loops 3 and 4 are both empty.
		move.l	d3,d7
		sub.l	d5,d7		;d7 = t1 - t2
		bhi.b	.rms_DoL3	;unsigned test; the other t1/t2 tests here use signed branches

		bra.w	.rmsl4_Done

		cnop	0,4
.rms_Loop2:
		move.l	-(a3),d1	;d1 = spanstart[b1]
		move.l	d4,d0		;d0 = b1
		jsr	(@R_MapPlane)	;R_MapPlane(b1, spanstart[b1], x-1)
		subq.l	#1,d4		;b1--
		dbf	d7,.rms_Loop2

.rmsl2_Done:
	;The following copy loops (spanstart[??]=x;) are optimised
	;by assuming that the loop is executed several (more than four) times

		;prepare while(t2 < t1 && t2<=b2)
		move.l	d3,d7
		sub.l	d5,d7		;d7 = t1 - t2
		ble.b	.rmsl3_Done
.rms_DoL3:
		subq.l	#1,d7		;d7 = (t1 - t2) - 1
		move.l	d6,d0
		sub.l	d5,d0		;d0 = b2 - t2 = (b2 - t2 + 1) - 1
		bmi.w	.rmsl4_Done	;t2 > b2: loops 3 and 4 are both empty

		lea		(a2,d5.l*4),a3	;a3 -> spanstart[t2]
		cmp.l	d0,d7
		bmi.b	.rms_StartLoop3	;t1 - t2 is the smaller count

		;Limited by t2<=b2: store d0+1 entries, then t2 > b2 so loop 4
		;is empty as well.
		move.l	d2,(a3)+	;always at least once
		bclr	#0,d0
		beq.b	.rmsl32_Pass1
		move.l	d2,(a3)+	;odd remainder
.rmsl32_Pass1:
		bclr	#1,d0
		beq.b	.rmsl32_Pass2
		move.l	d2,(a3)+	;remainder of two
		move.l	d2,(a3)+
.rmsl32_Pass2:
		tst.l	d0	; any groups of four left?
		beq.b	.rmsl4_Done
.rms_Loop3_2:
		move.l	d2,(a3)+
		move.l	d2,(a3)+
		move.l	d2,(a3)+
		move.l	d2,(a3)+
		subq.l	#4,d0
		bgt.b	.rms_Loop3_2
		bra.b	.rmsl4_Done

		cnop	0,4

		;Limited by t2<t1: store d7+1 = t1 - t2 entries.
.rms_StartLoop3:
		move.l	d2,(a3)+	;always at least once

		bclr	#0,d7
		beq.b	.rmsl3_Pass1
		move.l	d2,(a3)+	;odd remainder
.rmsl3_Pass1:
		bclr	#1,d7
		beq.b	.rmsl3_Pass2
		move.l	d2,(a3)+	;remainder of two
		move.l	d2,(a3)+
.rmsl3_Pass2:
		tst.l	d7
		beq.b	.rmsl3_Done
		;NOTE(review): d5 (t2) is advanced only here, four at a time, so
		;after this loop it can be lower than the real t2 (= t1). Loop 4
		;may then store x into a few entries that loop 3 has just set to
		;the same x, which looks harmless - confirm before relying on d5.
.rms_Loop3:
		move.l	d2,(a3)+
		move.l	d2,(a3)+
		move.l	d2,(a3)+
		move.l	d2,(a3)+
		addq.l	#4,d5	;saves four addq per loop
		subq.l	#4,d7	;also saves four dbfs, though adds here...
		bgt.b	.rms_Loop3

.rmsl3_Done:

		;prepare while(b2 > b1 && b2>=t2)
		move.l	d6,d0
		sub.l	d5,d0		;d0 = b2 - t2
		bmi.b	.rmsl4_Done
		move.l	d6,d7
		sub.l	d4,d7		;d7 = b2 - b1
		ble.b	.rmsl4_Done
		subq.l	#1,d7

		lea	4(a2,d6.l*4),a3	;a3 -> spanstart[b2+1], written with predecrement
		cmp.l	d0,d7
		bmi.b	.rms_StartLoop4
		move.l	d0,d7		;b2>=t2 effective

.rms_StartLoop4:
		move.l	d2,-(a3)	;always at least once

		bclr	#0,d7
		beq.b	.rmsl4_Pass1
		move.l	d2,-(a3)	;odd remainder
.rmsl4_Pass1:
		bclr	#1,d7
		beq.b	.rmsl4_Pass2
		move.l	d2,-(a3)	;remainder of two
		move.l	d2,-(a3)
.rmsl4_Pass2:
		tst.l	d7
		beq.b	.rmsl4_Done
.rms_Loop4:
		move.l	d2,-(a3)
		move.l	d2,-(a3)
		move.l	d2,-(a3)
		move.l	d2,-(a3)
		subq.l	#4,d7
		bgt.b	.rms_Loop4

.rmsl4_Done:
		addq.l	#4,sp		;drop the x-1 argument
		movem.l	(sp)+,d3-d7/a2/a3
;		movem.l	(sp)+,d2-d7/a2/a3
		rts


;void R_MakeSpans(int x, int t1, int b1, int t2, int b2)
;{
;
;    while (t1 < t2 && t1<=b1)
;    {
;		R_MapPlane (t1,spanstart[t1],x-1);
;		t1++;
;    }
;    while (b1 > b2 && b1>=t1)
;    {
;		R_MapPlane (b1,spanstart[b1],x-1);
;		b1--;
;    }
;
;    while (t2 < t1 && t2<=b2)
;    {
;		spanstart[t2] = x;
;		t2++;
;    }
;    while (b2 > b1 && b2>=t2)
;    {
;		spanstart[b2] = x;
;		b2--;
;    }
;}

;-----------------------------------------------------------------------
; R_DrawPlanes (in r_plane.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

;		STRUCTURE	visplane,0
;
;		 LONG	height
;		 LONG	picnum
;		 LONG	lightlevel
;		 LONG	minx
;		 LONG	maxx
;		 PTR	top
;		 PTR	bottom
;
;		LABEL	visplane_size

height		equ	0		; fixed
picnum		equ	4		; int
lightlevel	equ	8		; int
minx		equ	12		; int
maxx		equ	16		; int
top		equ	20		; unsigned short* ([-1..SCREENWIDTH])
bottom		equ	24		; unsigned short* ([-1..SCREENWIDTH])
visplane_size	equ	28

		xref	_visplanes		; FAR visplane_t[]
		xref	_skyflatnum		; int
		xref	_flattranslation	; int*
		xref	_firstflat		; int
		xref	_viewz			; fixed_t
		xref	_planeheight		; fixed_t
		xref	_zlight	; FAR lighttable_t *[LIGHTLEVELS][MAXLIGHTZ]
		xref	_planezlight		; lighttable_t**
		xref	_extralight		; int
		xref	_lastvisplane		; visplane_t*
;;;		xref	_pspriteiscale		; fixed_t
		xref	_pspriteiscale2		; fixed_t
		xref	_detailshift		; int
		xref	_skytexturemid		; int
		xref	_viewangle		; angle_t
		xref	_skytexture		; int

;		xref	@W_CacheLumpNum
		xref	@Z_ChangeTag2

		cnop	0,4
; R_DrawPlanes - draw every visplane from visplanes up to (not including)
; lastvisplane; assembly version of the C routine quoted below this function.
;
; Takes no arguments. Globals are addressed relative to a4.
; Saves and restores d2-d7/a2/a3/a5/a6.
;
; Register roles:
;   a2 = pl (current visplane)
;   a3 -> pl->top[],  a5 -> pl->bottom[]   (16-bit entries)
;   d2 = x (starts at pl->minx), d3 = pl->maxx, later a loop counter
;   flat path: d3/d4/d5/d6 = t1/b1/t2/b2 for R_MakeSpans, d7 = dbf counter
;   sky path : d5/d6 = dc_yl/dc_yh, d7 = ANGLETOSKYSHIFT, a6 -> xtoviewangle[x]
;
; NOTE(review): the symbol "height" is given a second value (2, for patches)
; further down this file; confirm the assembler resolves height(a2) here to
; the visplane offset 0.
@R_DrawPlanes:
_R_DrawPlanes:
		movem.l	d2-d7/a2/a3/a5/a6,-(sp)

		move.l	#_visplanes,a2
		bra.w	.rd_Check	; test first, like the C for loop: with no
					; visplanes nothing may be drawn

.rd_Loop:
		move.l	minx(a2),d2
		move.l	maxx(a2),d3		;These ones are used later, too
		cmp.l	d2,d3
		bmi.w	.rd_Next		;minx>maxx -> next loop

;;;		move.l	a2,a3
;;;		add.l	d2,a3		; a3 -> pl + minx
;;;		move.l	a3,a5		; a5 -> pl + minx
;;;		add.l	#top,a3		; a3 -> pl + minx + top
;;;		add.l	#bottom-1,a5	; a5 -> pl + minx + bottom - 1

		movea.l	top(a2),a3
		lea	(a3,d2.l*2),a3	; a3 -> pl->top[minx]
		movea.l	bottom(a2),a5
		lea	-2(a5,d2.l*2),a5 ; a5 -> pl->bottom[minx-1]

		move.l	picnum(a2),d1	; used if not sky...
		cmp.l	_skyflatnum(a4),d1
		beq.w	.rdl_Sky

; ---- regular flat ----
		move.l	_flattranslation(a4),a0
		move.l	_firstflat(a4),d0
		add.l	(a0,d1.l*4),d0	; D1 contains picnum, A0 array of ints
		moveq	#1,d1		; PU_STATIC
		jsr	(@W_CacheLumpNum)
		move.l	d0,_ds_source(a4)

		move.l	height(a2),d0
		sub.l	_viewz(a4),d0
		bpl.b	.rdl_HP		; These two lines are equal to
		neg.l	d0		; abs() or iabs(). Branch if >=0, otherwise switch sign
.rdl_HP:
		move.l	d0,_planeheight(a4)

		; planezlight = zlight[clamp(light, 0, LIGHTLEVELS-1)];
		; one zlight row is MAXLIGHTZ (128) pointers = 512 bytes
		move.l	lightlevel(a2),d0
		lsr.l	#4,d0	;LIGHTSEGSHIFT
		move.l	#_zlight,_planezlight(a4)
		add.l	_extralight(a4),d0

		tst.l	d0
		bmi.b	.rdl_LightDone	; light < 0 -> zlight[0], already set
		cmp.l	#16,d0		; 16=LIGHTLEVELS
		bmi.b	.rdl_LightOK
		add.l	#7680,_planezlight(a4)	;15<<9 ((15<<7)*4)
		bra.b	.rdl_LightDone
.rdl_LightOK:
		lsl.l	#8,d0
		add.l	d0,d0		; d0 = light * 512
		add.l	d0,_planezlight(a4)

.rdl_LightDone:
;;;		move.b	#$FF,top+1(a2,d3.l)	;D3=maxx, top+1 ->top[maxx+1]
;;;		move.b	#$FF,-(a3)
		; sentinels so the columns just outside the plane look empty
		movea.l	top(a2),a0
		move.w	#$FFFF,2(a0,d3.l*2)	; pl->top[maxx+1] = 0xffff
		move.w	#$FFFF,-(a3)		; pl->top[minx-1] = 0xffff, a3 -> top[minx-1]

		;x=pl->minx=d2
		move.l	d3,d7
		sub.l	d2,d7
		addq.l	#1,d7	; dbf count: x runs minx .. maxx+1 inclusive

		; clear the upper words once; only the low words are loaded below
		moveq	#0,d3
		moveq	#0,d4
		moveq	#0,d5
		moveq	#0,d6

.rdl_MSLoop:
;;;		move.b	(a3)+,d3	; bumps to top[x] at the same time!
;;;		move.b	(a3),d5
;;;		move.b	(a5)+,d4
;;;		move.b	(a5),d6
		move.w	(a3)+,d3	; t1 = top[x-1], bumps to top[x] at the same time!
		move.w	(a3),d5		; t2 = top[x]
		move.w	(a5)+,d4	; b1 = bottom[x-1]
		move.w	(a5),d6		; b2 = bottom[x]
		jsr	(@R_MakeSpans)	; passes d2/d3/d4/d5/d6
		addq.l	#1,d2		; x++
		dbf	d7,.rdl_MSLoop

		move.l	_ds_source(a4),a0
		moveq	#101,d0		; PU_CACHE
		jsr	(@Z_ChangeTag2)

.rd_Next:
		add.l	#visplane_size,a2
.rd_Check:
		cmp.l	_lastvisplane(a4),a2	; pl<lastvisplane
		bmi.w	.rd_Loop

		movem.l	(sp)+,d2-d7/a2/a3/a5/a6

		rts

; ---- sky flat: one column draw per x, always with colormaps[0] ----
		cnop	0,4
.rdl_Sky:
;;; 		move.l	_pspriteiscale(a4),d0
;;; 		move.l	_detailshift(a4),d1
;;; 		asr.l	d1,d0
		move.l	_pspriteiscale2(a4),d0
		move.l	_colormaps(a4),_dc_colormap(a4)
		move.l	d0,_dc_iscale(a4)
		move.l	_skytexturemid(a4),_dc_texturemid(a4)

		sub.l	d2,d3	; maxx-minx == maxx-x, how many till x>maxx
		moveq	#0,d5
		moveq	#0,d6
;;;		lea	_xtoviewangle(a4),a6
		movea.l	_xtoviewangle(a4),a6
		lea	(a6,d2.l*4),a6	; a6 -> xtoviewangle[minx]
		moveq	#22,d7		; ANGLETOSKYSHIFT
		move.l	d2,_dc_x(a4)	; dc_x = minx
;;;		addq.l	#1,a5
		addq.l	#2,a5		; a5 -> pl->bottom[minx]

.rdl_SkyLoop:
;;;		move.b	(a3)+,d5
;;;		move.b	(a5)+,d6
		move.w	(a3)+,d5	; dc_yl = pl->top[x]
		move.w	(a5)+,d6	; dc_yh = pl->bottom[x]
		move.l	d5,_dc_yl(a4)
		move.l	d6,_dc_yh(a4)

		move.l	(a6)+,d1	; xtoviewangle[x]; fetched even if not drawn
					; so the pointer stays in step with x

		cmp.l	d5,d6
		bmi.b	.rdlsl_Next	; dc_yh < dc_yl: nothing to draw

		add.l	_viewangle(a4),d1	; viewangle + xtoviewangle[x]
		move.l	_skytexture(a4),d0
		lsr.l	d7,d1		; angle; angle_t is unsigned, so shift logically
		jsr	(@R_GetColumn)	; R_GetColumn(skytexture,angle)
		move.l	d0,_dc_source(a4)
		move.l	_colfunc(a4),a0
		jsr	(a0)		; drawn with dc_x == x

.rdlsl_Next:
		; dc_x is stepped only after the column has been drawn; stepping
		; it before the draw would put column x at x+1
		addq.l	#1,_dc_x(a4)	; dc_x = x + 1 for the next pass
		dbf	d3,.rdl_SkyLoop
		bra.b	.rd_Next


; void R_DrawPlanes (void)
; {
;     visplane_t*		pl;
;     int			light;
;     int			x;
;     int			stop;
;     int			angle;
; 
;     for (pl = visplanes ; pl < lastvisplane ; pl++)
;     {
; 	if (pl->minx > pl->maxx)
; 	    continue;
; 
; 	
; 	// sky flat
; 	if (pl->picnum == skyflatnum)
; 	{
; 	    dc_iscale = pspriteiscale2/*>>detailshift*/;
; 	    
; 	    // Sky is allways drawn full bright,
; 	    //  i.e. colormaps[0] is used.
; 	    // Because of this hack, sky is not affected
; 	    //  by INVUL inverse mapping.
; 	    dc_colormap = colormaps;
; 	    dc_texturemid = skytexturemid;
; 	    for (x=pl->minx ; x <= pl->maxx ; x++)
; 	    {
; 			dc_yl = pl->top[x];
; 			dc_yh = pl->bottom[x];
; 
; 			if (dc_yl <= dc_yh)
; 			{
; 			    angle = (viewangle + xtoviewangle[x])>>ANGLETOSKYSHIFT;
; 			    dc_x = x;
; 			    dc_source = R_GetColumn(skytexture, angle);
; 			    colfunc ();
; 			}
; 	    }
; 	    continue;
; 	}
; 	
; 	// regular flat
; 	ds_source = W_CacheLumpNum(firstflat +
; 				   flattranslation[pl->picnum],
; 				   PU_STATIC);
; 	
; 	planeheight = iabs(pl->height-viewz);
; 	light = (pl->lightlevel >> LIGHTSEGSHIFT)+extralight;
; 
; 	if (light >= LIGHTLEVELS)
; 	    light = LIGHTLEVELS-1;
; 
; 	if (light < 0)
; 	    light = 0;
; 
; 	planezlight = zlight[light];
; 
;	//pl->top[pl->maxx+1] = 0xff;
;	//pl->top[pl->minx-1] = 0xff;
;	pl->top[pl->maxx+1] = 0xffff;
;	pl->top[pl->minx-1] = 0xffff;
; 		
; 	stop = pl->maxx + 1;
; 
; 	for (x=pl->minx ; x<= stop ; x++)
; 	{
; 	    R_MakeSpans(x,pl->top[x-1],
; 			pl->bottom[x-1],
; 			pl->top[x],
; 			pl->bottom[x]);
; 	}
; 	
; 	Z_ChangeTag (ds_source, PU_CACHE);
;     }
; }


;-----------------------------------------------------------------------
; R_DrawMaskedColumn (in r_things.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

		xref	_sprtopscreen	;fixed_t
		xref	_spryscale	;fixed_t
		xref	_mfloorclip	;short*
		xref	_mceilingclip	;short*

;void R_DrawMaskedColumn (column_t* column)

;column_t=post_t=
;	byte	topdelta
;	byte	length

		cnop	0,4

; R_DrawMaskedColumn - draw every post of a masked column through colfunc;
; assembly version of the C routine quoted below this function.
;
; In:   a0 -> column (list of posts, terminated by a topdelta byte of $FF)
; Post layout used here: byte 0 = topdelta, byte 1 = length, pixel data from
; byte 3, next post at length+4.
; Saves and restores d2-d6/a2/a3. dc_texturemid is changed per post and set
; back to its entry value before returning.
;
; Register roles:
;   a2 -> current post          a3 = colfunc
;   d3 = mfloorclip[dc_x]  (zero-extended word)
;   d4 = mceilingclip[dc_x] (sign-extended)
;   d5 = spryscale              d6 = basetexturemid
;   d1 = dc_yl, d2 = dc_yh for the current post
_R_DrawMaskedColumn:
@R_DrawMaskedColumn:

		cmp.b	#$FF,(a0)
		beq.w	.rd_Exit	; empty column: nothing saved yet, just return

		movem.l	d2-d6/a2/a3,-(sp)

		move.l	_dc_x(a4),d0
		moveq	#0,d3
		move.l	_mfloorclip(a4),a1
		move.w	(a1,d0.l*2),d3	; d3 = mfloorclip[dc_x]
		move.l	_mceilingclip(a4),a1
		move.w	(a1,d0.l*2),d4
		ext.l	d4		; d4 = mceilingclip[dc_x]

		move.l	_dc_texturemid(a4),d6	;basetexturemid
		move.l	_spryscale(a4),d5
		move.l	_colfunc(a4),a3

		move.l	a0,a2		; keep the column pointer
.rd_Loop2:
		; topdelta and length are unsigned bytes, so both are
		; zero-extended; sign-extending topdelta would move posts with
		; topdelta >= 128 to the wrong place
		moveq	#0,d1
		move.b	(a2),d1		; d1 = column->topdelta
		moveq	#0,d2
		move.b	1(a2),d2	; d2 = column->length
		muls.l	d5,d1		; spryscale*topdelta
		mulu.l	d5,d2		; spryscale*length
		add.l	_sprtopscreen(a4),d1	; d1 = topscreen
		add.l	d1,d2			; d2 = bottomscreen

		add.l	#FRACUNIT-1,d1
		swap	d1		;>>FRACBITS
		ext.l	d1		; d1 = dc_yl = (topscreen+FRACUNIT-1)>>FRACBITS

		subq.l	#1,d2
		swap	d2
		and.l	#$FFFF,d2	; d2 = low 16 bits of (bottomscreen-1)>>FRACBITS

		; the dc_yh tests below compare words only
		cmp.w	d3,d2		; if (dc_yh >= mfloorclip[dc_x])
		bmi.b	.rd_yhok
		move.l	d3,d2
		subq.l	#1,d2		; dc_yh = mfloorclip[dc_x]-1
.rd_yhok:
		cmp.l	d1,d4		; if (dc_yl <= mceilingclip[dc_x])
		bmi.b	.rd_ylok
		move.l	d4,d1
		addq.l	#1,d1		; dc_yl = mceilingclip[dc_x]+1
.rd_ylok:

		move.l	d1,_dc_yl(a4)
		move.l	d2,_dc_yh(a4)

		cmp.w	d1,d2		; if (dc_yl <= dc_yh)
		bmi.b	.rd_skip

		move.l	a2,_dc_source(a4)
		addq.l	#3,_dc_source(a4)	; dc_source = (byte *)column + 3

		move.l	d6,_dc_texturemid(a4)
		moveq	#0,d0
		move.b	(a2),d0
		swap	d0			; topdelta<<FRACBITS
		sub.l	d0,_dc_texturemid(a4)	; dc_texturemid = basetexturemid - (topdelta<<FRACBITS)
		jsr	(a3)			; colfunc()
.rd_skip:
		; column += column->length + 4, done as a long add so lengths
		; of 252 and above cannot wrap to a smaller step
		moveq	#0,d0
		move.b	1(a2),d0
		addq.l	#4,d0
		add.l	d0,a2

		cmp.b	#$FF,(a2)
		bne.b	.rd_Loop2

		move.l	d6,_dc_texturemid(a4)	; dc_texturemid = basetexturemid

		movem.l	(sp)+,d2-d6/a2/a3

.rd_Exit:
		rts


;//
;// R_DrawMaskedColumn
;// Used for sprites and masked mid textures.
;// Masked means: partly transparent, i.e. stored
;//  in posts/runs of opaque pixels.
;//
;short*		mfloorclip;
;short*		mceilingclip;
;
;fixed_t		spryscale;
;fixed_t		sprtopscreen;
;
;void R_DrawMaskedColumn (column_t* column)
;{
;    int		topscreen;
;    int 	bottomscreen;
;    fixed_t	basetexturemid;
;	
;    basetexturemid = dc_texturemid;
;	
;    for ( ; column->topdelta != 0xff ; ) 
;    {
;	// calculate unclipped screen coordinates
;	//  for post
;	topscreen = sprtopscreen + spryscale*column->topdelta;
;	bottomscreen = topscreen + spryscale*column->length;
;
;	dc_yl = (topscreen+FRACUNIT-1)>>FRACBITS;
;	dc_yh = (bottomscreen-1)>>FRACBITS;
;		
;	if (dc_yh >= mfloorclip[dc_x])
;	    dc_yh = mfloorclip[dc_x]-1;
;	if (dc_yl <= mceilingclip[dc_x])
;	    dc_yl = mceilingclip[dc_x]+1;
;
;	if (dc_yl <= dc_yh)
;	{
;	    dc_source = (byte *)column + 3;
;	    dc_texturemid = basetexturemid - (column->topdelta<<FRACBITS);
;	    // dc_source = (byte *)column + 3 - column->topdelta;
;
;	    // Drawn by either R_DrawColumn
;	    //  or (SHADOW) R_DrawFuzzColumn.
;	    colfunc ();	
;	}
;	column = (column_t *)(  (byte *)column + column->length + 4);
;    }
;	
;    dc_texturemid = basetexturemid;
;}

;-----------------------------------------------------------------------
; R_MapPlane (in r_plane.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

		xref	_cachedheight	;fixed_t*
		xref	_cacheddistance	;fixed_t*
		xref	_yslope		;fixed_t*
		xref	_cachedxstep	;fixed_t*
		xref	_cachedystep	;fixed_t*
		xref	_basexscale	;fixed_t
		xref	_baseyscale	;fixed_t
		xref	_distscale	;fixed_t*
		xref	_viewx		;fixed_t
		xref	_viewy		;fixed_t
		xref	_finecosine	;fixed_t*
		xref	_finesine	;FAR int []
		xref	_fixedcolormap	;lighttable_t*

		xref	_spanfunc

MAXLIGHTZ	equ	128
LIGHTZSHIFT	equ	20
ANGLETOFINESHIFT	equ	19

;void
;R_MapPlane
;( int		y,
;  int		x1,
;  int		x2 )

		cnop	0,4
; R_MapPlane - set up the ds_* globals for one horizontal span and call
; spanfunc; assembly version of the C routine quoted below this function.
;
; In:   d0 = y, d1 = x1, x2 on the stack (first long above the return address)
; Saves and restores d3/d4/d7.
;
; Register roles:
;   d3 = y, later length       d4 = x1, later the fine angle index
;   d7 = distance              a1 = FixedMul, kept across all the calls
;   a0 = table pointer, reloaded after every FixedMul call
@R_MapPlane:
_R_MapPlane:
		movem.l	d3/d4/d7,-(sp)

		move.l	d0,d3		;Y
		move.l	d1,d4		;X1

		move.l	_FixedMul(a4),a1	;Prepare function. amiga_fixed.s does not use A1!
		move.l	_planeheight(a4),d0	;ready for cache...
;;;		move.l	#_cachedheight,a0
		movea.l	_cachedheight(a4),a0
		cmp.l	(a0,d3.l*4),d0		;if (planeheight != cachedheight[y])
		beq.b	.rm_If2				;ah, we are cached!

; cache miss: recompute distance and both steps for this row and store them
		move.l	d0,(a0,d3.l*4)		;cachedheight[y] = planeheight
;;;		move.l	#_yslope,a0
		movea.l	_yslope(a4),a0
		move.l	(a0,d3.l*4),d1
		jsr	(a1)			;d0 = FixedMul(planeheight, yslope[y])
		move.l	d0,d7			;save distance
;;;		move.l	#_cacheddistance,a0
		movea.l	_cacheddistance(a4),a0
		move.l	d0,(a0,d3.l*4)		;cacheddistance[y] = distance
		move.l	_basexscale(a4),d1
		jsr	(a1)			;d0 = FixedMul(distance, basexscale)
		move.l	d0,_ds_xstep(a4)
;;;		move.l	#_cachedxstep,a0
		movea.l	_cachedxstep(a4),a0
		move.l	d0,(a0,d3.l*4)		;cachedxstep[y] = ds_xstep
		move.l	_baseyscale(a4),d1
		move.l	d7,d0
		jsr	(a1)			;d0 = FixedMul(distance, baseyscale)
		move.l	d0,_ds_ystep(a4)
;;;		move.l	#_cachedystep,a0
		movea.l	_cachedystep(a4),a0
		move.l	d0,(a0,d3.l*4)		;cachedystep[y] = ds_ystep
		bra.b	.rm_1Done

; cache hit: take distance and both steps from the per-row tables
.rm_If2:
;;; 		move.l	#_cacheddistance,a0
		movea.l	_cacheddistance(a4),a0
		move.l	(a0,d3.l*4),d7		;distance = cacheddistance[y]
;;;		move.l	#_cachedxstep,a0
		movea.l	_cachedxstep(a4),a0
		move.l	(a0,d3.l*4),_ds_xstep(a4)
;;;		move.l	#_cachedystep,a0
		movea.l	_cachedystep(a4),a0
		move.l	(a0,d3.l*4),_ds_ystep(a4)

.rm_1Done:
		move.l	d3,_ds_y(a4)
		move.l	d4,_ds_x1(a4)

		move.l	d7,d0
;;;		move.l	#_distscale,a0
		movea.l	_distscale(a4),a0
		move.l	(a0,d4.l*4),d1
		jsr	(a1)			;d0 = length = FixedMul(distance, distscale[x1])
		move.l	d0,d3	;Y not needed anymore

;;;		lea	_xtoviewangle(a4),a0
		movea.l	_xtoviewangle(a4),a0
		move.l	(a0,d4.l*4),d4		;x1 not needed anymore
		add.l	_viewangle(a4),d4
		moveq	#ANGLETOFINESHIFT,d1
		lsr.l	d1,d4			;d4 = angle = (viewangle + xtoviewangle[x1])>>ANGLETOFINESHIFT
		move.l	_finecosine(a4),a0
		move.l	(a0,d4.l*4),d0
		move.l	d3,d1
		jsr	(a1)			;d0 = FixedMul(finecosine[angle], length)
		move.l	_viewx(a4),_ds_xfrac(a4)
		add.l	d0,_ds_xfrac(a4)	;ds_xfrac = viewx + d0
		move.l	#_finesine,a0
		move.l	(a0,d4.l*4),d0
		move.l	d3,d1
		jsr	(a1)			;d0 = FixedMul(finesine[angle], length)
		move.l	_viewy(a4),d1
		neg.l	d1
		move.l	d1,_ds_yfrac(a4)
		sub.l	d0,_ds_yfrac(a4)	;ds_yfrac = -viewy - d0

		move.l	_fixedcolormap(a4),d0	;if (fixedcolormap)
		bne.b	.rm_FixedCMAP

		moveq	#LIGHTZSHIFT,d0
		lsr.l	d0,d7			;index = distance >> LIGHTZSHIFT
		cmp.l	#MAXLIGHTZ,d7
		bmi.b	.rm_D7OK
		move.l	#MAXLIGHTZ-1,d7		;clamp index to MAXLIGHTZ-1
.rm_D7OK:
		move.l	_planezlight(a4),a0
		move.l	(a0,d7.l*4),_ds_colormap(a4)	;ds_colormap = planezlight[index]

.rm_CMAPDone:
		; 12 bytes of saved registers + 4 bytes return address = 16
		move.l	16(sp),_ds_x2(a4)	;ds_x2 = x2

		move.l	_spanfunc(a4),a0
		jsr	(a0)			;spanfunc()

		movem.l	(sp)+,d3/d4/d7

		rts
		cnop	0,4
.rm_FixedCMAP:
		move.l	d0,_ds_colormap(a4)	;ds_colormap = fixedcolormap
		bra.b	.rm_CMAPDone

;void
;R_MapPlane
;( int		y,
;  int		x1,
;  int		x2 )
;{
;    angle_t	angle;
;    fixed_t	distance;
;    fixed_t	length;
;    unsigned	index;
;
;    if (planeheight != cachedheight[y])
;    {
;	cachedheight[y] = planeheight;
;	distance = cacheddistance[y] = FixedMul (planeheight, yslope[y]);
;	ds_xstep = cachedxstep[y] = FixedMul (distance,basexscale);
;	ds_ystep = cachedystep[y] = FixedMul (distance,baseyscale);
;    }
;    else
;    {
;	distance = cacheddistance[y];
;	ds_xstep = cachedxstep[y];
;	ds_ystep = cachedystep[y];
;    }
;
;    length = FixedMul (distance,distscale[x1]);
;    angle = (viewangle + xtoviewangle[x1])>>ANGLETOFINESHIFT;
;    ds_xfrac = viewx + FixedMul(finecosine[angle], length);
;    ds_yfrac = -viewy - FixedMul(finesine[angle], length);
;
;    if (fixedcolormap)
;	ds_colormap = fixedcolormap;
;    else
;    {
;	index = distance >> LIGHTZSHIFT;
;	
;	if (index >= MAXLIGHTZ )
;	    index = MAXLIGHTZ-1;
;
;	ds_colormap = planezlight[index];
;    }
;	
;    ds_y = y;
;    ds_x1 = x1;
;    ds_x2 = x2;
;
;    // high or low detail
;    spanfunc ();	
;}

;-----------------------------------------------------------------------
; R_GetColumn (in r_data.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

;		xref	@W_CacheLumpNum
		xref	@R_GenerateComposite
		xref	_texturewidthmask	; byte*
		xref	_texturecolumnlump	; short**
		xref	_texturecolumnofs	; unsigned short**
		xref	_texturecomposite	; byte**

;int*			texturewidthmask;
;fixed_t*		textureheight;		
;int*			texturecompositesize;
;short**		texturecolumnlump;
;unsigned short**	texturecolumnofs;
;byte**			texturecomposite;

		cnop	0,4
; R_GetColumn - return a pointer to the pixel data of one texture column.
;
; In:   d0 = tex, d1 = col
; Out:  d0 = W_CacheLumpNum(lump, PU_CACHE) + ofs   when lump > 0,
;            texturecomposite[tex] + ofs             otherwise
;            (the composite is generated first if its pointer is still null)
; Saves and restores d2/d4.  d2 = tex, d4 = ofs (zero-extended word).
@R_GetColumn:
_R_GetColumn:
		movem.l	d2/d4,-(sp)
		move.l	d0,d2			; d2 = tex for the rest of the routine

		movea.l	_texturewidthmask(a4),a0
		and.l	(a0,d2.l*4),d1		; col &= texturewidthmask[tex]

		movea.l	_texturecolumnofs(a4),a0
		movea.l	(a0,d2.l*4),a0		; a0 -> texturecolumnofs[tex]
		moveq	#0,d4
		move.w	(a0,d1.l*2),d4		; ofs = texturecolumnofs[tex][col]

		movea.l	_texturecolumnlump(a4),a0
		movea.l	(a0,d2.l*4),a0		; a0 -> texturecolumnlump[tex]
		moveq	#0,d0
		move.w	(a0,d1.l*2),d0		; lump = texturecolumnlump[tex][col]
		bgt.b	.gc_Lump		; word tested as signed: lump > 0

; lump <= 0: the column lives in the composite texture
		movea.l	_texturecomposite(a4),a0
		move.l	(a0,d2.l*4),d0		; d0 = texturecomposite[tex]
		bne.b	.gc_AddOfs
		move.l	d2,d0
		jsr	(@R_GenerateComposite)	; R_GenerateComposite(tex)
		movea.l	_texturecomposite(a4),a0
		move.l	(a0,d2.l*4),d0		; reload the pointer after generating
		bra.b	.gc_AddOfs

; lump > 0: the column comes straight from a cached lump
.gc_Lump:
		moveq	#101,d1			; PU_CACHE
		jsr	(@W_CacheLumpNum)

.gc_AddOfs:
		add.l	d4,d0			; + ofs
		movem.l	(sp)+,d2/d4
		rts

;byte*
;R_GetColumn
;( int		tex,
;  int		col )
;{
;    int		lump;
;    int		ofs;
;	
;    col &= texturewidthmask[tex];
;    lump = texturecolumnlump[tex][col];
;    ofs = texturecolumnofs[tex][col];
;
;    if (lump > 0)
;	return (byte *)W_CacheLumpNum(lump,PU_CACHE)+ofs;
;
;   if (!texturecomposite[tex])
;	R_GenerateComposite (tex);
;
;   return texturecomposite[tex] + ofs;
;}

;-----------------------------------------------------------------------
; W_CacheLumpNum (in w_wad.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

		xref	_numlumps	;int
		xref	_lumpcache	;void**

		xref	@W_LumpLength
		xref	@W_ReadLump
		xref	@Z_ChangeTag2
		xref	_I_Error
		xref	@Z_Malloc

		cnop	0,4
; W_CacheLumpNum - return a pointer to the cached data of a lump, loading it
; first if it is not cached; assembly version of the C routine quoted below.
;
; In:   d0 = lump, d1 = tag
; Out:  d0 = lumpcache[lump]
; Calls I_Error when (unsigned)lump >= numlumps.
; a2 -> lumpcache[lump] while the routine runs; a2 (and d2/d3 on the miss
; path) are saved and restored.
_W_CacheLumpNum:
@W_CacheLumpNum:
		cmp.l	_numlumps(a4),d0
		bcc.b	.wc_Error	; unsigned >=, so negative lump numbers are
					; rejected too: (unsigned)lump >= numlumps

		move.l	a2,-(sp)

		move.l	_lumpcache(a4),a2
		lea	(a2,d0.l*4),a2	; a2 -> lumpcache[lump]
		tst.l	(a2)
		beq.b	.wc_Miss

; cache hit: only the tag has to be updated
		move.l	(a2),a0
		move.l	d1,d0
		jsr	(@Z_ChangeTag2)	; Z_ChangeTag(lumpcache[lump], tag)
		move.l	(a2),d0

		move.l	(sp)+,a2
		rts

; cache miss: allocate with &lumpcache[lump] as the owner and read the lump in
		cnop	0,4
.wc_Miss:
		movem.l	d2/d3,-(sp)
		move.l	d0,d2		; d2 = lump
		move.l	d1,d3		; d3 = tag
		jsr	(@W_LumpLength)	; d0 = W_LumpLength(lump)
		move.l	d3,d1
		move.l	a2,a0
		jsr	(@Z_Malloc)	; Z_Malloc(length, tag, &lumpcache[lump])
		move.l	d2,d0
		move.l	(a2),a0
		jsr	(@W_ReadLump)	; W_ReadLump(lump, lumpcache[lump])
		move.l	(a2),d0
		movem.l	(sp)+,d2/d3/a2	; a2 was pushed first, so it is popped last
		rts

; Stack arguments are pushed last argument first (as done for I_MarkRect in
; V_DrawPatch), so the format string has to be pushed after the lump number.
.wc_Error:
		move.l	d0,-(sp)		; second argument: lump
		move.l	#.wc_Msg,-(sp)		; first argument: format string
		jsr	(_I_Error)
		addq.l	#8,sp
		rts

.wc_Msg:
		dc.b	"W_CacheLumpNum: %i >= numlumps",0

;void*
;W_CacheLumpNum
;( int		lump,
;  int		tag )
;{
;    byte*	ptr;
;
;    if ((unsigned)lump >= numlumps)
;	I_Error ("W_CacheLumpNum: %i >= numlumps",lump);
;		
;    if (!lumpcache[lump])
;    {
;	// read the lump in
;	
;	//printf ("cache miss on lump %i\n",lump);
;	ptr = Z_Malloc (W_LumpLength (lump), tag, &lumpcache[lump]);
;	W_ReadLump (lump, lumpcache[lump]);
;    }
;    else
;    {
;	//printf ("cache hit on lump %i\n",lump);
;	Z_ChangeTag (lumpcache[lump],tag);
;    }
;	
;    return lumpcache[lump];
;}


;-----------------------------------------------------------------------
; V_DrawPatch (in v_video.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

		xref	_screens	;byte* screens[5]

		xref	@I_MarkRect

;	STRUCTURE	patch,0
;	 WORD		width
;	 WORD		height
;	 WORD		leftoffset
;	 WORD		topoffset
;	 STRUCT		columnofs,9*4	;nine ints
;	LABEL		patch_size

width		equ	0
height		equ	2
leftoffset	equ	4
topoffset	equ	6
columnofs	equ	8
patch_size	equ	8+(9*4)

;column_t
;	STRUCTURE	column,0
;	 BYTE		topdelta
;	 BYTE		length
;	LABEL		column_size

topdelta	equ	0
length		equ	1
column_size	equ	2

		cnop	0,4
; V_DrawPatch / V_DrawPatchDirect - copy a patch (column/post format) into one
; of the screens[] buffers; assembly version of the C routine quoted below.
;
; In:   d0 = x, d1 = y, a0 -> patch, scrn on the stack
; The 16- and 32-bit patch header fields are byte-swapped before use.
; When scrn is 0, I_MarkRect(x, y, width, height) is called before drawing.
; Saves and restores d3-d6/a2/a3/a5.
;
; Register roles:
;   a2 -> patch      a3 -> next columnofs entry     a5 -> current post/source
;   a1 = dest        d3 = x    d4 = y, then y*SCREENWIDTH
;   d5 = desttop     d6 = scrn, then the column counter
;
; NOTE(review): the symbol "height" is also defined as 0 (visplane field)
; earlier in this file; confirm the assembler resolves height(a2) here to the
; patch offset 2.
_V_DrawPatch:
@V_DrawPatch:
_V_DrawPatchDirect:
@V_DrawPatchDirect:
		movem.l	d3-d6/a2/a3/a5,-(sp)

		move.l	d0,d3	;x
		move.l	d1,d4	;y.. scrn in (sp), patch in a0

		move.l	a0,a2	;Store patch
		moveq	#0,d0
		move.w	topoffset(a2),d0
		rol.w	#8,d0	;SWAPSHORT
		ext.l	d0
		sub.l	d0,d4	;y -= SWAPSHORT(patch->topoffset)
		moveq	#0,d0
		move.w	leftoffset(a2),d0
		rol.w	#8,d0	;SWAPSHORT
		ext.l	d0
		sub.l	d0,d3	;x -= SWAPSHORT(patch->leftoffset)

		; 28 bytes of saved registers + 4 bytes return address = 32
		move.l	32(sp),d6	;scrn
		bne.b	.vd_ScrnOK
		move.l	d3,d0
		move.l	d4,d1
		moveq	#0,d5
		move.w	height(a2),d5
		rol.w	#8,d5	;SWAPSHORT
		move.l	d5,-(sp)	;fourth argument: height
		move.w	width(a2),d5
		rol.w	#8,d5	;SWAPSHORT
		move.l	d5,-(sp)	;third argument: width
		jsr	(@I_MarkRect)
		addq.l	#8,sp

.vd_ScrnOK:
		lea	_screens(a4),a0
		move.l	(a0,d6.l*4),d5	;screens[scrn]
;Peter... change here (quite obvious)
		muls.l	_SCREENWIDTH(a4),d4	;y not needed further
		add.l	d3,d5	;+x
		add.l	d4,d5	;+y*SCREENWIDTH

		;D3=x, D5=desttop,
		moveq	#0,d6
		move.w	width(a2),d6
		rol.w	#8,d6	;SWAPSHORT
		;D6=w
		subq.l	#1,d6	;for ; col<w
		bmi.w	.vd_exit	;w == 0: no columns at all (dbf would run 65536 times)
		lea	columnofs(a2),a3	;prepare for columnofs[col]

.vd_Loop:
		move.l	(a3)+,d0
		rol.w	#8,d0
		swap	d0
		rol.w	#8,d0		;three instructions for SWAPLONG
		move.l	a2,a5		;column=patch+
		add.l	d0,a5		;... SWAPLONG(patch->columnofs[col])

		cmp.b	#$FF,(a5)
		beq.b	.vdl_Next	;column without posts

.vdl_Loop:
		move.l	d5,a1		;dest=desttop +

;... here are the other references to SCREENWIDTH
;	lsl.l #8,x + lsl.l #6,x is equal to 256x+64x=320x

		moveq	#0,d0
		move.b	(a5),d0		;column->topdelta*
;;;		move.l	d0,d1	;!
;;;		lsl.l	#8,d0	;!
;;;		lsl.l	#6,d1	;!
;;;		add.l	d0,a1	;!
;;;		add.l	d1,a1	;!

		muls.l	_SCREENWIDTH(a4),d0
		add.l	d0,a1		;dest = desttop + topdelta*SCREENWIDTH

		move.b	1(a5),d0	;count = column->length (low byte of d0)
		addq.l	#3,a5		;source
		tst.b	d0
		beq.b	.vdl_PostDone	;empty post: draw nothing, like while (count--);
					;the counting loop below would wrap and
					;write 256 pixels
		;Would it be possible to use the code from DrawColumn functions by Aki
		;here, too??
.vdl_DrawLoop:
		move.b	(a5)+,(a1)	;*dest = *source++
		add.l	_SCREENWIDTH(a4),a1	;dest += SCREENWIDTH
		subq.b	#1,d0
		bne.b	.vdl_DrawLoop

.vdl_PostDone:
		addq.l	#1,a5		;bump to next column..
		;bumped already by three and length, so one more. (column +=column->length+4)

		cmp.b	#$FF,(a5)
		bne.b	.vdl_Loop

.vdl_Next:
		addq.l	#1,d5		;desttop++

		dbf	d6,.vd_Loop
.vd_exit:
		movem.l	(sp)+,d3-d6/a2/a3/a5

		rts

;void
;V_DrawPatch
;( int		x,
;  int		y,
;  int		scrn,
;  patch_t*	patch ) 
;{ 
;
;    int		count;
;    int		col; 
;    column_t*	column; 
;    byte*	desttop;
;    byte*	dest;
;    byte*	source; 
;    int		w; 
;	 
;    y -= SWAPSHORT(patch->topoffset); 
;    x -= SWAPSHORT(patch->leftoffset); 
; 
;    col = 0; 
;    desttop = screens[scrn]+y*SCREENWIDTH+x; 
;	 
;    w = SWAPSHORT(patch->width); 
;
;    for ( ; col<w ; x++, col++, desttop++)
;    { 
;	column = (column_t *)((byte *)patch + SWAPLONG(patch->columnofs[col])); 
; 
;	// step through the posts in a column 
;	while (column->topdelta != 0xff ) 
;	{ 
;	    source = (byte *)column + 3; 
;	    dest = desttop + column->topdelta*SCREENWIDTH; 
;	    count = column->length; 
;			 
;	    while (count--) 
;	    { 
;		*dest = *source++; 
;		dest += SCREENWIDTH; 
;	    } 
;	    column = (column_t *)(  (byte *)column + column->length 
;				    + 4 ); 
;	}
;    }
;    if (!scrn)
;	I_MarkRect (x, y, SWAPSHORT(patch->width), SWAPSHORT(patch->height)); 
;
;} 

;-----------------------------------------------------------------------
; P_DivLineSide (in p_sight.c) by Arto Huusko <arto.huusko@pp.qnet.fi>

;	STRUCTURE	divline,0
;	 LONG		x	;all are actually fixed_t values..
;	 LONG		y
;	 LONG		dx
;	 LONG		dy
;	LABEL		divline_size

x		equ	0
y		equ	4
dx		equ	8
dy		equ	12
divline_size	equ	16

;
; int P_DivlineSide (fixed_t x, fixed_t y, divline_t *node)
;
; In:	d0 = x
;	d1 = y
;	a0 = node (fields at the offsets defined above)
; Out:	d0 = 0, 1 or 2 (same values as the C reference below)
; Uses:	d0 always, d1 only in the general (dx!=0 and dy!=0) case, CCR
;
@P_DivlineSide:
_P_DivlineSide:
		tst.l	dx(a0)
		bne.b	.ds_HaveDX

		;--- node->dx == 0 ---
		cmp.l	(a0),d0		;x ? node->x
		beq.b	.ds_Ret2	;x == node->x
		bgt.b	.ds_XGreater	;x >  node->x

		;x < node->x => return node->dy > 0
		tst.l	dy(a0)
		bgt.b	.ds_Ret1
		moveq	#0,d0
		rts

.ds_XGreater:
		;x > node->x => return node->dy < 0
		tst.l	dy(a0)
		bmi.b	.ds_Ret1
		moveq	#0,d0
		rts

.ds_HaveDX:
		tst.l	dy(a0)
		bne.b	.ds_General

		;--- node->dy == 0 (and node->dx != 0) ---
		cmp.l	y(a0),d0	;x (not y) against node->y, exactly
		beq.b	.ds_Ret2	;as the C reference does it

		cmp.l	y(a0),d1	;y ? node->y
		bgt.b	.ds_YGreater

		;y <= node->y => return node->dx < 0
		tst.l	dx(a0)
		bmi.b	.ds_Ret1
		moveq	#0,d0
		rts

.ds_YGreater:
		;y > node->y => return node->dx > 0
		;(node->dx is known to be non-zero on this path)
		tst.l	dx(a0)
		bgt.b	.ds_Ret1
		moveq	#0,d0
		rts

.ds_General:
		;--- neither dx nor dy is zero ---
		sub.l	(a0),d0		;d0 = x - node->x
		sub.l	y(a0),d1	;d1 = y - node->y
		swap	d0		;low word = (x - node->x)>>FRACBITS
		swap	d1		;low word = (y - node->y)>>FRACBITS

		;The word found at dy(a0)/dx(a0) is the high word of the
		;longword stored there, i.e. the field >>FRACBITS, so a
		;16x16->32 multiply is all that is needed.
		muls.w	dy(a0),d0	;d0 = left
		muls.w	dx(a0),d1	;d1 = right

		;Both values are products of two signed words, so
		;right-left cannot overflow and N alone gives the sign.
		cmp.l	d0,d1		;right - left
		beq.b	.ds_Ret2	;left == right
		bmi.b	.ds_Ret0	;right <  left
.ds_Ret1:
		moveq	#1,d0
		rts
.ds_Ret2:
		moveq	#2,d0
		rts
.ds_Ret0:
		moveq	#0,d0
		rts

;int
;P_DivlineSide
;( fixed_t	x,
;  fixed_t	y,
;  divline_t*	node )
;{
;    fixed_t	dx;
;    fixed_t	dy;
;    fixed_t	left;
;    fixed_t	right;
;
;    if (!node->dx)
;    {
;	if (x==node->x)
;	    return 2;
;	
;	if (x <= node->x)
;	    return node->dy > 0;
;
;	return node->dy < 0;
;    }
;    
;    if (!node->dy)
;    {
;	if (x==node->y)
;	    return 2;
;
;	if (y <= node->y)
;	    return node->dx < 0;
;
;	return node->dx > 0;
;    }
;	
;    dx = (x - node->x);
;    dy = (y - node->y);
;
;    left =  (node->dy>>FRACBITS) * (dx>>FRACBITS);
;    right = (dy>>FRACBITS) * (node->dx>>FRACBITS);
;	
;    if (right < left)
;	return 0;	// front side
;    
;    if (left == right)
;	return 2;
;    return 1;		// back side
;}

;***********************************************************************

		end