;THE COMPUTER CODE CONTAINED HEREIN IS THE SOLE PROPERTY OF PARALLAX
;SOFTWARE CORPORATION ("PARALLAX").  PARALLAX, IN DISTRIBUTING THE CODE TO
;END-USERS, AND SUBJECT TO ALL OF THE TERMS AND CONDITIONS HEREIN, GRANTS A
;ROYALTY-FREE, PERPETUAL LICENSE TO SUCH END-USERS FOR USE BY SUCH END-USERS
;IN USING, DISPLAYING,  AND CREATING DERIVATIVE WORKS THEREOF, SO LONG AS
;SUCH USE, DISPLAY OR CREATION IS FOR NON-COMMERCIAL, ROYALTY OR REVENUE
;FREE PURPOSES.  IN NO EVENT SHALL THE END-USER USE THE COMPUTER CODE
;CONTAINED HEREIN FOR REVENUE-BEARING PURPOSES.  THE END-USER UNDERSTANDS
;AND AGREES TO THE TERMS HEREIN AND ACCEPTS THE SAME BY USE OF THIS FILE.  
;COPYRIGHT 1993-1998 PARALLAX SOFTWARE CORPORATION.  ALL RIGHTS RESERVED.
;
; $Source: f:/miner/source/texmap/rcs/tmap_per.asm $
; $Revision: 1.26 $
; $Author: john $
; $Date: 1995/02/20 18:22:55 $
;
; Perspective texture mapper inner loop.
;
; $Log: tmap_per.asm $
; Revision 1.26  1995/02/20  18:22:55  john
; Put all the externs in the assembly modules into tmap_inc.asm.
; Also, moved all the C versions of the inner loops into a new module, 
; scanline.c.
; 
; Revision 1.25  1995/02/20  17:09:08  john
; Added code so that you can build the tmapper with no assembly!
; 
; Revision 1.24  1995/01/10  09:32:07  mike
; mostly fix garbage at end of scanline, but slow down by 1-4%.
; 
; Revision 1.23  1994/12/02  23:29:57  mike
; optimizations.
; 
; Revision 1.22  1994/11/30  00:57:00  mike
; optimization.
; 
; Revision 1.21  1994/11/21  13:57:42  mike
; fix right side shear bug
; 
; Revision 1.20  1994/11/12  16:41:09  mike
; jae -> ja.
; 
; Revision 1.19  1994/10/27  19:40:00  john
; Made lighting table lookup be _gr_fade_table[eax] instead
; of fs:[eax], which gets rid of a segment override that 
; supposedly costs 1 clock on a 486.  Mainly, I wanted to verify
; that the only reason we need selectors is for the source texture
; data .
; 
; Revision 1.18  1994/05/03  11:08:32  mike
; Trap divide overflows.
; 
; Revision 1.17  1994/04/21  15:03:41  mike
; make faster.
; 
; Revision 1.16  1994/04/08  16:46:57  john
; Made 32 fade levels. Hacked.
; 
; Revision 1.15  1994/03/31  08:35:18  mike
; Fix quantized-by-4 bug in inner loop.
; 
; Revision 1.14  1994/03/14  17:41:14  mike
; Fix bug in unlighted version.
; 
; Revision 1.13  1994/03/14  15:45:14  mike
; streamline code.
; 
; Revision 1.12  1994/01/14  14:01:58  mike
; *** empty log message ***
; 
; Revision 1.11  1993/12/18  14:43:44  john
; Messed around with doing 1/z, the u*(1/z) and v*(1/z)
; (Went from 23 fps to 21 fps... not good! )
;
; Revision 1.10  1993/12/17  16:14:17  john
; Split lighted/nonlighted, so there is no cmp lighting
; in the inner loop. 
; 
; Revision 1.9  1993/12/17  12:34:29  john
; Made leftover bytes use linear approx instead of correct...
; should save about 8 divides per scanline on average.
; Also, took out anti-aliasing code and rearranged to
; order of some instructions to help on 486 pipelining.
; (The anti-aliasing code did *not* look good, so I
; figure there was no reason to keep it in. )
; 
; Revision 1.8  1993/12/16  18:37:52  mike
; Align some stuff on 4 byte boundaries.
; 
; Revision 1.7  1993/11/30  08:44:18  john
; Made selector set check for < 64*64 bitmaps.
; 
; Revision 1.6  1993/11/23  17:25:26  john
; Added safety "and eax, 0fffh" in lighting lookup.
; 
; Revision 1.5  1993/11/23  15:08:52  mike
; Fixed lighting bug. 
; 
; Revision 1.4  1993/11/23  14:38:50  john
; optimized NORMAL code by switching EBX and ESI, so BH can be used in
; the lighting process.
; 
; Revision 1.3  1993/11/23  14:30:53  john
; Made the perspective tmapper do 1/8 divides; added lighting.
; 
; Revision 1.2  1993/11/22  10:24:59  mike
; *** empty log message ***
; 
; Revision 1.1  1993/09/08  17:29:53  mike
; Initial revision
; 
;
;

	.386

	public	asm_tmap_scanline_per_

	include	tmap_inc.asm

	public _max_ecx,_min_ecx


_DATA	SEGMENT DWORD PUBLIC USE32 'DATA'
	align	4
; Scratch slot for edx.  In this file it is only referenced by the commented-out
; alternate inner loop inside no_trans1.
mem_edx dd ?
;**_v_window_left	dd	_window_left
;**_v_window_right	dd	_window_right
;**_v_window_top	dd	_window_top
;**_v_window_bottom	dd	_window_bottom

; ---------- These are passed in by the C caller ----------
; ----------^^ These are passed in by the C caller ^^----------

	public	_x,_loop_count, new_end
	align	4
_x	dd	?	; exported; not read or written by the code in this file
_loop_count	dd	?	; iteration counter shared by every inner loop below

_max_ecx	dd	0	; exported; not referenced by the code in this file
_min_ecx	dd	55555555h	; exported; not referenced by the code in this file

extern _per2_flag:dword	; zero selects the divide-per-pixel loop (tmap_loop), nonzero the 2^NBITS-pixel loops
new_end	dd	1	; if set, use new, but slower, way of finishing off extra pixels on scanline, 01/10/95 --MK


	public	_scan_doubling_flag, _linear_if_far_flag

; Both flags are exported but never tested by the code in this file.
_scan_doubling_flag	dd 0
_linear_if_far_flag	dd 0

;---------- local variables
	align	4
req_base	dd	?	; not referenced by the code in this file
req_size	dd	?	; not referenced by the code in this file
U0	dd	?	; u/z at the start of the current 2^NBITS-pixel span (result of the PDIV-style divide)
U1	dd	?	; not referenced by the code in this file (U1 is kept in ebx instead)
V0	dd	?	; v/z at the start of the current 2^NBITS-pixel span
V1	dd	?	; not referenced by the code in this file (V1 is kept in ebp instead)
num_left_over	dd	?	; pixels remaining after the last whole 2^NBITS-pixel span
DU1	dd	?	; _fx_du_dx shl NBITS: u step per span
DV1	dd	?	; _fx_dv_dx shl NBITS: v step per span
DZ1	dd	?	; _fx_dz_dx shl NBITS: z step per span
;**_fx_dl_dx1	dd	?
;**_fx_dl_dx2	dd	?

_DATA	ENDS

DGROUP	GROUP	_DATA


_TEXT   SEGMENT PARA PUBLIC USE32 'CODE'
	ASSUME	DS:_DATA
	ASSUME	CS:_TEXT

; --------------------------------------------------------------------------------------------------
; Enter:
;	_xleft	fixed point left x coordinate
;	_xright	fixed point right x coordinate
;	_y	fixed point y coordinate
;	_pixptr	address of source pixel map
;	_u	fixed point initial u coordinate
;	_v	fixed point initial v coordinate
;	_z	fixed point initial z coordinate
;	_du_dx	fixed point du/dx
;	_dv_dx	fixed point dv/dx
;	_dz_dx	fixed point dz/dx

;   for (x = (int) xleft; x <= (int) xright; x++) {
;      _setcolor(read_pixel_from_tmap(srcb,((int) (u/z)) & 63,((int) (v/z)) & 63));
;      _setpixel(x,y);
;
;      u += du_dx;
;      v += dv_dx;
;      z += dz_dx;
;   }


;;goto_none_to_do:
;; int 3	; thinking this can't get hit, if so, kill compare against _window_bottom
;; jmp _none_to_do
	align	16
; asm_tmap_scanline_per_ -- entry point: draw one perspective-mapped scanline.
; No register arguments; every input is read from globals (_fx_y, _fx_xleft, _fx_xright,
; _fx_u, _fx_v, _fx_z, _fx_l, _fx_dl_dx, ...), which are not defined in this file
; (presumably declared by tmap_inc.asm -- confirm there).
; ES and the general registers are saved here and restored at _none_to_do, the single exit.
; Side effects on globals: _fx_l and _fx_dl_dx are shifted right by 8 IN PLACE, and
; _loop_count is overwritten.
asm_tmap_scanline_per_:
	push	es
	pusha
	
;---------------------------- setup for loop ---------------------------------
; Setup for loop:	_loop_count  iterations = (int) xright - (int) xleft
;	es	selector used for all texture reads (es:[...])
;	edi	destination pointer = _y_pointers[y] + write_buffer + max(xleft,0)
; NOTE: fx_xright and fx_xleft changed from fix to int by mk on 12/01/94.

; set es = selector used to read the source texture data

; set edi = address of first pixel to modify
	mov	edi,_fx_y
	mov	es,_pixel_data_selector	; selector[0*2]
;;	cmp	edi,_window_bottom
;;	ja	goto_none_to_do

	mov	edi,_y_pointers[edi*4]	; row offset looked up from a dword table indexed by y

	mov	ebx,_fx_xleft
	test	ebx, ebx
	jns	ebx_ok
	xor	ebx, ebx	; negative left edge is clamped to column 0
ebx_ok:	add	edi,write_buffer
	add	edi,ebx

; set _loop_count = # of iterations
	mov	eax,_fx_xright
;;	cmp	eax,_window_right
;;	jl	eax_ok1
;;	mov	eax,_window_right
;;eax_ok1:
;;	cmp	eax,_window_left
;;	jg	eax_ok2
;;	mov	eax,_window_left
;;eax_ok2:
	sub	eax,ebx
	js	_none_to_do	; xright is left of the (clamped) xleft: nothing to draw
;;	cmp	eax,_window_width
;;	jbe	_ok_to_do
;;	mov	eax,_window_width
;;_ok_to_do:
	mov	_loop_count,eax	; xright - xleft; the loops treat this as inclusive (count+1 pixels)

; lighting values are passed in fixed point, but need to be in 8 bit integer, 8 bit fraction so we can easily
; get the integer by reading %bh
	sar	_fx_l, 8
	sar	_fx_dl_dx,8	; sets SF from the shifted delta for the jns below
	jns	dl_dx_ok
	inc	_fx_dl_dx	; round towards 0 for negative deltas
dl_dx_ok:

; set initial values
	mov	ebx,_fx_u
	mov	ebp,_fx_v
	mov	ecx,_fx_z

; dispatch: _per2_flag == 0 -> one divide pair per pixel; otherwise the 2^NBITS-pixel
; loops, lighted or unlighted depending on _Lighting_on
	test	_per2_flag,-1
	je	tmap_loop

	test	_Lighting_on, -1
	je	tmap_loop_fast_nolight
	jmp	tmap_loop_fast

;================ PERSPECTIVE TEXTURE MAP INNER LOOPS ========================
;
; Usage in loop:	eax	division, pixel value
;	ebx	u
;	ecx	z
;	edx	division
;	ebp	v
;	esi	source pixel pointer
;	edi	destination pixel pointer

;-------------------- NORMAL PERSPECTIVE TEXTURE MAP LOOP -----------------
; Divide-per-pixel loop.  Entered with ebx = u, ebp = v, ecx = z, edi = destination and
; _loop_count = pixels - 1.  Also used as the fallback by the fast paths when fewer than
; 2^NBITS pixels remain.  Exits through _none_to_do.
tmap_loop:
	mov	esi, ebx	; esi becomes u coordinate

	align	4
tmap_loop0:

; compute v coordinate
	mov	eax, ebp	; get v
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (v/z)

	and	eax,3fh	; mask with height-1
	mov	ebx,eax

; compute u coordinate
	mov	eax, esi	; get u
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (u/z)

	shl 	eax,26	; keep only the low 6 bits of u, moved to the top of eax
	shld 	ebx,eax,6	; ebx = v*64+u

; read 1 pixel
	xor	eax, eax
	test	_Lighting_on, -1
	mov	al, es:[ebx]	; get pixel from source bitmap (mov leaves the flags from test intact)
	je	NoLight1

; LIGHTING CODE
	mov	ebx, _fx_l	; get temp copy of lighting value
	mov	ah, bh	; get lighting level
	add	ebx, _fx_dl_dx	; update lighting value
	mov	al, _gr_fade_table[eax]	; xlat pixel thru lighting tables
	mov	_fx_l, ebx	; save temp copy of lighting value

; transparency check
; NOTE(review): on the lighted path al already holds the fade-table OUTPUT here, whereas the
; other lighted loops in this file compare the SOURCE texel against 255 before the lookup.
; The two agree only if the fade table maps 255 to 255 and nothing else to 255 -- confirm.
NoLight1:	cmp	al,255
	je	skip1

	mov	[edi],al
skip1:	inc	edi
	
; update deltas
	add	ebp,_fx_dv_dx
	add	esi,_fx_du_dx
	add	ecx,_fx_dz_dx
	je	_div_0_abort	; would be dividing by 0, so abort

	dec	_loop_count
	jns	tmap_loop0	; runs _loop_count+1 times in total

; Common exit for every path in this file: restore what the entry point saved and return.
_none_to_do:	
	popa
	pop	es
	ret

; We detected a z=0 condition, which seems pretty bogus, don't you think?
; So, we abort, but maybe we want to know about it.
; Every jump here is taken with the stack balanced relative to the entry pusha/push es
; (the loops test for z == 0 before pushing anything).
_div_0_abort:
;**	int	3
	jmp	_none_to_do

;-------------------------- PER/4 TMAPPER ----------------
; 
;	x = x1
;	U0 = u/w; V0 = v/w;
;	while ( 1 )
;		u += du_dx*4; v+= dv_dx*4
;		U1 = u/w; V1 = v/w;
;		DUDX = (U1-U0)/4; DVDX = (V1-V0)/4;
;
;	; Pixel 0
;		pixels = texmap[V0*64+U0];
;		U0 += DUDX; V0 += DVDX
;	; Pixel 1
;		pixels = (pixels<<8)+texmap[V0*64+U0];
;		U0 += DUDX; V0 += DVDX
;	; Pixel 2
;		pixels = (pixels<<8)+texmap[V0*64+U0];
;		U0 += DUDX; V0 += DVDX
;	; Pixel 3
;		pixels = (pixels<<8)+texmap[V0*64+U0];
;
;		screen[x] = pixel
;		x += 4;
;		U0 = U1; V0 = V1 

NBITS = 4	; 2^NBITS pixels plotted per divide
ZSHIFT = 4	; precision used in PDIV macro


PDIV MACRO
; Returns EAX/ECX in 16.16 format in EAX. Trashes EDX
;          sig bits   6.3
; Forms the 64-bit dividend edx:eax = eax shl ZSHIFT (sign-extended), divides by ecx,
; then shifts the quotient left by the remaining 16-ZSHIFT bits.
; The caller must make sure ecx is not zero; idiv also faults if the quotient overflows 32 bits.
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (eax shl ZSHIFT)/ecx
	shl	eax, 16-ZSHIFT

ENDM

; wr_onepix num -- store cl at num[edi] unless it is 255 (transparent), then rotate the next
; byte of ecx into cl.  Only referenced from commented-out lines in this file.
wr_onepix	macro	num
	local	skip
	cmp	cl,255
	je	skip
	mov	num[edi],cl
skip:	ror	ecx,8

	endm

public tmap_loop_fast

; -------------------------------------- Start of Getting Dword Aligned ----------------------------------------------
; Lighted fast path.  Entered with ebx = fx_u, ebp = fx_v, ecx = fx_z, edi = destination.
; Pixels are drawn one at a time (two divides each) until edi is dword aligned, then control
; passes to DwordAligned1.  If the scanline runs out first, exit through _none_to_do.

tmap_loop_fast:
	mov	esi,ebx	; esi holds u while ebx is used as scratch below

	align	4
NotDwordAligned1:
	test	edi, 11b
	jz	DwordAligned1

; compute v coordinate
	mov	eax, ebp	; get v
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (v/z)

	and	eax,3fh	; mask with height-1
	mov	ebx,eax

; compute u coordinate
	mov	eax, esi	; get u
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (u/z)

	shl 	eax,26	; keep only the low 6 bits of u, moved to the top of eax
	shld 	ebx,eax,6	; ebx = v*64+u

; read 1  pixel
	xor	eax, eax
	mov	al, es:[ebx]	; get pixel from source bitmap

; lighting code
; The lighting value advances for every pixel, including transparent ones.
	mov	ebx, _fx_l	; get temp copy of lighting value
	mov	ah, bh	; get lighting level
	add	ebx, _fx_dl_dx	; update lighting value
	mov	_fx_l, ebx	; save temp copy of lighting value

; transparency check
	cmp	al,255
	je	skip2	; this pixel is transparent, so don't write it (or light it)

	mov	al, _gr_fade_table[eax]	; xlat pixel thru lighting tables
					
; write 1 pixel
	mov	[edi],al
skip2:	inc	edi
	
; update deltas
	add	ebp,_fx_dv_dx
	add	esi,_fx_du_dx
	add	ecx,_fx_dz_dx
	je	_div_0_abort	; would be dividing by 0, so abort

	dec	_loop_count
	jns	NotDwordAligned1

	jmp	_none_to_do

; -------------------------------------- End of Getting Dword Aligned ----------------------------------------------

; edi is now dword aligned.  Split the remaining _loop_count+1 pixels into whole
; 2^NBITS-pixel spans (_loop_count) plus a remainder (num_left_over), compute the starting
; U0/V0 with the ZSHIFT-precision divide, and pre-scale the deltas to one span.
; If there is no whole span, fall back to the divide-per-pixel loop.
DwordAligned1:

;--;	mov	ebx,esi	; get fx_u
;--;
;--;	mov	eax, _loop_count
;--;	inc	eax
;--;	mov	num_left_over, eax
;--;	shr	eax, NBITS
;--;
;--;	test	eax, -1
;--;	je	tmap_loop	; there are no 2^NBITS chunks, do divide/pixel for whole scanline
;--;	
;--;	mov 	_loop_count, eax	; _loop_count = pixels / NPIXS
;--;	shl	eax, NBITS
;--;	sub	num_left_over, eax	; num_left_over = obvious

	mov	eax, _loop_count
	mov	ebx, esi	; get fx_u [pentium pipelining]
	inc	eax	; eax = pixels remaining
	mov	esi, eax
	and	esi, (1 shl NBITS) - 1	; esi = pixels mod 2^NBITS
	sar	eax, NBITS	; eax = number of whole spans; ZF from this shift is tested by the je below
	mov	num_left_over, esi	; mov does not disturb the flags
	je	tmap_loop	; there are no 2^NBITS chunks, do divide/pixel for whole scanline
	mov 	_loop_count, eax	; _loop_count = pixels / NPIXS
		
; compute initial v coordinate
; (same instruction sequence as the PDIV macro, written out inline)
	mov	eax,ebp	; get v
	mov	edx,ebp
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT
	mov	V0, eax

; compute initial u coordinate
	mov	eax,ebx	; get u
	mov	edx,ebx
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT
	mov	U0, eax

; Set deltas to NPIXS pixel increments
	mov	eax, _fx_du_dx
	shl	eax, NBITS
	mov	DU1, eax
	mov	eax, _fx_dv_dx
	shl	eax, NBITS
	mov	DV1, eax
	mov	eax, _fx_dz_dx
	shl	eax, NBITS
	mov	DZ1, eax

	align	4
; Lighted span loop: one pass draws 2^NBITS pixels.
; On entry ebx = u, ebp = v, ecx = z at the START of the span, U0/V0 hold u/z and v/z
; there, and edi is the destination.  The loop steps u,v,z to the END of the span, divides
; once for U1/V1, and linearly interpolates texture coordinates across the span.
; ebx/ecx/ebp are saved on the stack while they are reused; cont1 pops them.
TopOfLoop4:
	add	ebx, DU1
	add	ebp, DV1
	add	ecx, DZ1
	je	_div_0_abort	; would be dividing by 0, so abort (nothing pushed yet)

; Done with ebx, ebp, ecx until next iteration
	push	ebx
	push	ecx
	push	ebp
	push	edi
	
; Find fixed U1		
	mov	eax, ebx
	mov	edx,ebx
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT
	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1		
	mov	eax, ebp
	mov	edx, ebp
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)

	mov	ecx, U0	; ecx = U0 until pop's
	mov	edi, V0	; edi = V0 until pop's

	shl	eax, 16-ZSHIFT
	mov	ebp, eax	; ebp = V1 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax
		
; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0)/2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

; Save the U1 and V1 so we don't have to divide on the next iteration
	mov	U0, ebx
	mov	V0, ebp

	pop	edi	; Restore EDI before using it
		
; LIGHTING CODE
	mov	ebx, _fx_l
	mov	ebp, _fx_dl_dx

	test	_transparency_on,-1
	je	no_trans1

; Transparency-aware run: 2^NBITS pixels written one byte at a time.
; The lighting value is advanced for EVERY pixel, before the transparency test, so that a
; transparent texel does not make the lighting gradient lag behind (this matches the
; alignment loop, no_trans1 and DoEndPixels, which all step lighting unconditionally).
    REPT (1 SHL (NBITS-2))
	local	skip3,no_trans1
        REPT 2
	local	skipa1,skipa2

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
	mov 	al, es:[eax]	; get pixel from source bitmap
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value (even if the pixel is transparent)
	cmp	al,255
	je	skipa1
	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
	mov	[edi],al
skipa1:
	inc	edi

; Do odd pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
	mov 	al, es:[eax]	; get pixel from source bitmap
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value (even if the pixel is transparent)
	cmp	al,255
	je	skipa2
	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
	mov	[edi],al
skipa2:
	inc	edi

        ENDM

    ENDM
	jmp	cont1

; -------------------------------------------------------
; Lighted span without transparency testing: 2^NBITS pixels, packed four at a time in ecx
; and written as one dword.  ecx (z) is on the stack here, so it is free as a pixel buffer.
; Entered with esi = V:U accumulator, edx = V:U per-pixel delta, ebx = lighting value,
; ebp = lighting delta; continues into cont1.
no_trans1:
;----------; push edx
;----------; mov mem_edx, edx
;----------; sub edx, edx
  REPT (1 SHL (NBITS-2))
    REPT 2

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
	mov 	al, es:[eax]	; get pixel from source bitmap
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value
	mov	cl, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer

; Do odd pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
	mov 	al, es:[eax]	; get pixel from source bitmap
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value
	mov	ch, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer

; ----- This is about 1% faster than the above, and could probably be optimized more.
; ----- Problem is, it gets the u,v coordinates backwards.  What you would need to do
; ----- is switch the packing of the u,v coordinates above (about 95 lines up).
;----------;	mov	eax, esi
;----------;	shr	ax, 10
;----------;	rol	eax, 6
;----------;	mov	dx, ax
;----------;	add	esi, mem_edx
;----------;	mov	dl, es:[edx]
;----------;	mov	dh, bh
;----------;	add	ebx, ebp
;----------;	mov	cl, _gr_fade_table[edx]
;----------;
;----------;	mov	eax, esi
;----------;	shr	ax, 10
;----------;	rol	eax, 6
;----------;	mov	dx, ax
;----------;	add	esi, mem_edx
;----------;	mov	dl, es:[edx]
;----------;	mov	dh, bh
;----------;	add	ebx, ebp
;----------;	mov	ch, _gr_fade_table[edx]

	ror	ecx, 16	; move to next double dest pixel position

    ENDM
; after two pixel pairs and two 16-bit rotates, pixels 0..3 sit in bytes 0..3 of ecx
	mov 	[edi],ecx	; Draw 4 pixels to display
	add 	edi,4
  ENDM
;; pop edx
; End of one lighted span: store the lighting value back, restore v, z, u (pushed at
; TopOfLoop4) and loop while whole spans remain.
cont1:

; -------------------------------------------------------

; LIGHTING CODE
	mov	_fx_l, ebx
	pop	ebp
	pop	ecx
	pop	ebx
	dec	_loop_count
	jnz	TopOfLoop4

; All whole spans are drawn; finish if there is no remainder.
EndOfLoop4:
	test	num_left_over, -1
	je	_none_to_do

; ----------------------------------------- Start of LeftOver Pixels ------------------------------------------
; Choose how to draw the num_left_over trailing pixels (lighted path).
; With ecx = z at the end of the last whole span:
;   z + DZ1 < 0                    -> bah_bah
;   3*z < 4*(z + DZ1)  (signed)    -> okhere (one more interpolated span, cut short)
;   otherwise                      -> bah_bah
; bah_bah: if new_end is nonzero use NewDoEndPixels (divide per pixel), else fall into okhere.
; ecx is preserved across the test by the push/pop pair on every path.
DoEndPixels:
	push	ecx

	mov	eax, ecx
	lea	eax, [eax*2+eax]	; eax = 3*z

	add	ecx, DZ1
	js	notokhere
	shl	ecx,2	; ecx = 4*(z+DZ1)
	cmp	eax, ecx
	pop	ecx	; pop leaves the flags from cmp intact
	jl	okhere
	jmp	bah_bah
notokhere:
	pop	ecx
bah_bah:
	test	new_end,-1
	jne	NewDoEndPixels
; Step u, v, z one whole span ahead for the final (partial) span.
; If z lands on zero, abort.  If z goes negative, back the step off by DZ1/2, DZ1/4, ...
; (and u, v likewise) until z is non-negative or NBITS-1 halvings have been tried.
; Falls through / jumps to dep_cont with ebx = u, ebp = v, ecx = z.
okhere:

	add	ebx, DU1
	add	ebp, DV1
	add	ecx, DZ1
	je	_div_0_abort
	jns	dep_cont

; z went negative.
; this can happen because we added DZ1 to the current z, but dz1 represents dz for perhaps 16 pixels
; though we might only plot one more pixel.
;
; The shift count has to live in cl, but ecx holds z.  Keep z in esi for the duration of
; the back-off (esi is dead here: dep_cont reloads it before reading it) so that the
; counter no longer overwrites the low byte of z.
	mov	esi, ecx	; esi = z
	mov	ecx, 1	; cl = shift count, starts at 1 (half a span)

dep_loop:	mov	eax, DU1
	sar	eax, cl
	sub	ebx, eax

	mov	eax, DV1
	sar	eax, cl
	sub	ebp, eax

	mov	eax, DZ1
	sar	eax, cl
	sub	esi, eax
	je	_div_0_abort
	jns	dep_restore_z

	inc	ecx
	cmp	ecx, NBITS
	jne	dep_loop

dep_restore_z:
	mov	ecx, esi	; ecx = z again, as dep_cont expects

; Draw the num_left_over trailing pixels (lighted) by interpolating from U0/V0 towards the
; u/z, v/z computed at ebx = u, ebp = v, ecx = z.  Pixels are written at fixed offsets from
; edi (edi itself is not advanced).  Every exit is through _none_to_do, so the final
; lighting value in ebx is not stored back to _fx_l.
; NOTE(review): the per-pixel delta below always divides by 2^NBITS, even when the caller
; backed the step off to less than a whole span -- confirm that this is intended.
dep_cont:
	push	edi	; use edi as a temporary variable

; clamp z to a small positive minimum before dividing by it
	cmp	ecx,1 shl (ZSHIFT+1)
	jg	ecx_ok
	mov	ecx, 1 shl (ZSHIFT+1)
ecx_ok:

; Find fixed U1		
	mov	eax, ebx
	PDIV
	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1		
	mov	eax, ebp
	PDIV
	mov	ebp, eax	; ebp = V1 until pop's

	mov	ecx, U0	; ecx = U0 until pop's
	mov	edi, V0	; edi = V0 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax
		
; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0)/2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

	pop	edi	; Restore EDI before using it
		
	mov	ecx, num_left_over	; ecx = pixels still to draw; each pixel below decrements it

; LIGHTING CODE
	mov	ebx, _fx_l
	mov	ebp, _fx_dl_dx

	ITERATION = 0
	REPT (1 SHL (NBITS-1))
	local	skip4, skip5
; Do even pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	mov 	al, es:[eax]	; get pixel from source bitmap
	add	esi, edx	; inc u,v
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value
	cmp	al,255
	je	skip4
	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
	mov	[edi+ITERATION], al	; write pixel
skip4:	dec	ecx
	jz	_none_to_do	; drew the last leftover pixel
	ITERATION = ITERATION + 1

; Do odd pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	mov 	al, es:[eax]	; get pixel from source bitmap
	add	esi, edx	; inc u,v
	mov	ah, bh	; form lighting table lookup value
	add	ebx, _fx_dl_dx	; update lighting value (same value as ebp)
	cmp	al,255
	je	skip5
	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
	mov	[edi+ITERATION], al	; write pixel
skip5:	dec	ecx
	jz	_none_to_do	; drew the last leftover pixel
	ITERATION = ITERATION + 1
	ENDM

; Should never get here!!!!
; (the unrolled code covers 2^NBITS pixels and num_left_over was masked to less than that)
	int	3
	jmp	_none_to_do

; ----------------------------------------- End of LeftOver Pixels ------------------------------------------

; --BUGGY NEW--NewDoEndPixels:
; --BUGGY NEW--	mov	eax, num_left_over
; --BUGGY NEW--	and	num_left_over, 3
; --BUGGY NEW--	shr	eax, 2
; --BUGGY NEW--	je	NDEP_1
; --BUGGY NEW--	mov	_loop_count, eax
; --BUGGY NEW--
; --BUGGY NEW--; do 4 pixels per hunk, not 16, so div deltas by 4 (16/4=4)
; --BUGGY NEW-- shr DU1,2
; --BUGGY NEW-- shr DV1,2
; --BUGGY NEW-- shr DZ1,2
; --BUGGY NEW--
; --BUGGY NEW--NDEP_TopOfLoop4:
; --BUGGY NEW--	add	ebx, DU1
; --BUGGY NEW--	add	ebp, DV1
; --BUGGY NEW--	add	ecx, DZ1
; --BUGGY NEW--	je	_div_0_abort	; would be dividing by 0, so abort
; --BUGGY NEW--
; --BUGGY NEW--; Done with ebx, ebp, ecx until next iteration
; --BUGGY NEW--	push	ebx
; --BUGGY NEW--	push	ecx
; --BUGGY NEW--	push	ebp
; --BUGGY NEW--	push	edi
; --BUGGY NEW--	
; --BUGGY NEW--; Find fixed U1		
; --BUGGY NEW--	mov	eax, ebx
; --BUGGY NEW--	mov	edx,ebx
; --BUGGY NEW--	shl	eax,(ZSHIFT-2)
; --BUGGY NEW--	sar	edx,32-(ZSHIFT-2)
; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
; --BUGGY NEW--	shl	eax, 16-(ZSHIFT-2)
; --BUGGY NEW--	mov	ebx, eax	; ebx = U1 until pop's
; --BUGGY NEW--
; --BUGGY NEW--; Find fixed V1		
; --BUGGY NEW--	mov	eax, ebp
; --BUGGY NEW--	mov	edx, ebp
; --BUGGY NEW--	shl	eax,(ZSHIFT-2)
; --BUGGY NEW--	sar	edx,32-(ZSHIFT-2)
; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
; --BUGGY NEW--
; --BUGGY NEW--	mov	ecx, U0	; ecx = U0 until pop's
; --BUGGY NEW--	mov	edi, V0	; edi = V0 until pop's
; --BUGGY NEW--
; --BUGGY NEW--	shl	eax, 16-(ZSHIFT-2)
; --BUGGY NEW--	mov	ebp, eax	; ebp = V1 until pop's
; --BUGGY NEW--
; --BUGGY NEW--; Make ESI =  V0:U0 in 6:10,6:10 format
; --BUGGY NEW--	mov	eax, ecx
; --BUGGY NEW--	shr	eax, 6
; --BUGGY NEW--	mov	esi, edi
; --BUGGY NEW--	shl	esi, 10
; --BUGGY NEW--	mov	si, ax
; --BUGGY NEW--
; --BUGGY NEW--; Make EDX = DV:DU in 6:10,6:10 format
; --BUGGY NEW--	mov	eax, ebx
; --BUGGY NEW--	sub	eax, ecx
; --BUGGY NEW--	sar	eax, (NBITS-2)+6
; --BUGGY NEW--	mov	edx, ebp
; --BUGGY NEW--	sub	edx, edi
; --BUGGY NEW--	shl	edx, 10-(NBITS-2)	; EDX = V1-V0/ 4 in 6:10 int:frac
; --BUGGY NEW--	mov	dx, ax	; put delta u in low word
; --BUGGY NEW--
; --BUGGY NEW--; Save the U1 and V1 so we don't have to divide on the next iteration
; --BUGGY NEW--	mov	U0, ebx
; --BUGGY NEW--	mov	V0, ebp
; --BUGGY NEW--
; --BUGGY NEW--	pop	edi	; Restore EDI before using it
; --BUGGY NEW--		
; --BUGGY NEW--; LIGHTING CODE
; --BUGGY NEW--	mov	ebx, _fx_l
; --BUGGY NEW--	mov	ebp, _fx_dl_dx
; --BUGGY NEW--
; --BUGGY NEW--;**	test	_transparency_on,-1
; --BUGGY NEW--;**	je	NDEP_no_trans1
; --BUGGY NEW--
; --BUGGY NEW--        REPT 2
; --BUGGY NEW--	local	NDEP_skipa1, NDEP_skipa2
; --BUGGY NEW--
; --BUGGY NEW--	mov	eax, esi	; get u,v
; --BUGGY NEW--	shr	eax, 26	; shift out all but int(v)
; --BUGGY NEW--	shld	ax,si,6	; shift in u, shifting up v
; --BUGGY NEW--	add	esi, edx	; inc u,v
; --BUGGY NEW--	mov 	al, es:[eax]	; get pixel from source bitmap
; --BUGGY NEW--	cmp	al,255
; --BUGGY NEW--	je	NDEP_skipa1
; --BUGGY NEW--	mov	ah, bh	; form lighting table lookup value
; --BUGGY NEW--	add	ebx, ebp	; update lighting value
; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
; --BUGGY NEW--	mov	[edi],al
; --BUGGY NEW--NDEP_skipa1:
; --BUGGY NEW--	inc	edi
; --BUGGY NEW--
; --BUGGY NEW--; Do odd pixel
; --BUGGY NEW--	mov	eax, esi	; get u,v
; --BUGGY NEW--	shr	eax, 26	; shift out all but int(v)
; --BUGGY NEW--	shld	ax,si,6	; shift in u, shifting up v
; --BUGGY NEW--	add	esi, edx	; inc u,v
; --BUGGY NEW--	mov 	al, es:[eax]	; get pixel from source bitmap
; --BUGGY NEW--	cmp	al,255
; --BUGGY NEW--	je	NDEP_skipa2
; --BUGGY NEW--	mov	ah, bh	; form lighting table lookup value
; --BUGGY NEW--	add	ebx, ebp	; update lighting value
; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
; --BUGGY NEW--	mov	[edi],al
; --BUGGY NEW--NDEP_skipa2:
; --BUGGY NEW--	inc	edi
; --BUGGY NEW--
; --BUGGY NEW--        ENDM
; --BUGGY NEW--
; --BUGGY NEW--	mov	_fx_l, ebx
; --BUGGY NEW--	pop	ebp
; --BUGGY NEW--	pop	ecx
; --BUGGY NEW--	pop	ebx
; --BUGGY NEW--	dec	_loop_count
; --BUGGY NEW--	jnz	NDEP_TopOfLoop4
; --BUGGY NEW--
; --BUGGY NEW--	test	num_left_over, -1
; --BUGGY NEW--	je	_none_to_do
; --BUGGY NEW--
; --BUGGY NEW--NDEP_1:
; --BUGGY NEW--	mov	esi,ebx
; --BUGGY NEW--
; --BUGGY NEW--	align	4
; --BUGGY NEW--NDEP_loop:
; --BUGGY NEW--
; --BUGGY NEW--; compute v coordinate
; --BUGGY NEW--	mov	eax, ebp	; get v
; --BUGGY NEW--	mov	edx, eax
; --BUGGY NEW--	sar	edx, 31
; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
; --BUGGY NEW--
; --BUGGY NEW--	and	eax,3fh	; mask with height-1
; --BUGGY NEW--	mov	ebx,eax
; --BUGGY NEW--
; --BUGGY NEW--; compute u coordinate
; --BUGGY NEW--	mov	eax, 	esi	; get u
; --BUGGY NEW--	mov	edx, eax
; --BUGGY NEW--	sar	edx, 31
; --BUGGY NEW--	idiv	ecx	; eax = (u/z)
; --BUGGY NEW--
; --BUGGY NEW--	shl 	eax,26
; --BUGGY NEW--	shld 	ebx,eax,6	; esi = v*64+u
; --BUGGY NEW--
; --BUGGY NEW--; read 1  pixel
; --BUGGY NEW--	xor	eax, eax
; --BUGGY NEW--	mov	al, es:[ebx]	; get pixel from source bitmap
; --BUGGY NEW--
; --BUGGY NEW--; lighting code
; --BUGGY NEW--	mov	ebx, _fx_l	; get temp copy of lighting value
; --BUGGY NEW--	mov	ah, bh	; get lighting level
; --BUGGY NEW--	add	ebx, _fx_dl_dx	; update lighting value
; --BUGGY NEW--	mov	_fx_l, ebx	; save temp copy of lighting value
; --BUGGY NEW--
; --BUGGY NEW--; transparency check
; --BUGGY NEW--	cmp	al,255
; --BUGGY NEW--	je	NDEP_skip2	; this pixel is transparent, so don't write it (or light it)
; --BUGGY NEW--
; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat pixel thru lighting tables
; --BUGGY NEW--					
; --BUGGY NEW--; write 1 pixel
; --BUGGY NEW--	mov	[edi],al
; --BUGGY NEW--NDEP_skip2:	inc	edi
; --BUGGY NEW--	
; --BUGGY NEW--; update deltas
; --BUGGY NEW--	add	ebp,_fx_dv_dx
; --BUGGY NEW--	add	esi,_fx_du_dx
; --BUGGY NEW--	add	ecx,_fx_dz_dx
; --BUGGY NEW--	je	_div_0_abort	; would be dividing by 0, so abort
; --BUGGY NEW--
; --BUGGY NEW--	dec	num_left_over
; --BUGGY NEW--	jne	NDEP_loop
; --BUGGY NEW--
; --BUGGY NEW--	jmp	_none_to_do

; Slower way of finishing the scanline (lighted): two divides per pixel for each of the
; num_left_over remaining pixels.  Entered with ebx = u, ebp = v, ecx = z, edi = destination
; and num_left_over nonzero.  Exits through _none_to_do.
NewDoEndPixels:
	mov	esi,ebx	; esi holds u while ebx is used as scratch below

	align	4
NDEP_loop:

; compute v coordinate
	mov	eax, ebp	; get v
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (v/z)

	and	eax,3fh	; mask with height-1
	mov	ebx,eax

; compute u coordinate
	mov	eax, 	esi	; get u
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (u/z)

	shl 	eax,26	; keep only the low 6 bits of u, moved to the top of eax
	shld 	ebx,eax,6	; ebx = v*64+u

; read 1  pixel
	xor	eax, eax
	mov	al, es:[ebx]	; get pixel from source bitmap

; lighting code
	mov	ebx, _fx_l	; get temp copy of lighting value
	mov	ah, bh	; get lighting level
	add	ebx, _fx_dl_dx	; update lighting value
	mov	_fx_l, ebx	; save temp copy of lighting value

; transparency check
	cmp	al,255
	je	NDEP_skip2	; this pixel is transparent, so don't write it (or light it)

	mov	al, _gr_fade_table[eax]	; xlat pixel thru lighting tables
					
; write 1 pixel
	mov	[edi],al
NDEP_skip2:	inc	edi
	
; update deltas
	add	ebp,_fx_dv_dx
	add	esi,_fx_du_dx
	add	ecx,_fx_dz_dx
	je	_div_0_abort	; would be dividing by 0, so abort

	dec	num_left_over
	jne	NDEP_loop

	jmp	_none_to_do

; ==================================================== No Lighting Code ======================================================
public tmap_loop_fast_nolight
; Unlighted fast path.  Entered with ebx = fx_u, ebp = fx_v, ecx = fx_z, edi = destination.
; Pixels are drawn one at a time (two divides each) until edi is dword aligned, then control
; passes to DwordAligned1_nolight.  Texel value 255 is always treated as transparent here.
tmap_loop_fast_nolight:
	mov	esi,ebx	; esi holds u while ebx is used as scratch below

	align	4
NotDwordAligned1_nolight:
	test	edi, 11b
	jz	DwordAligned1_nolight

; compute v coordinate
	mov	eax,ebp	; get v
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (v/z)

	and	eax,3fh	; mask with height-1
	mov	ebx,eax

; compute u coordinate
	mov	eax, esi	; get u
	mov	edx, eax
	sar	edx, 31	; sign-extend eax into edx:eax for idiv
	idiv	ecx	; eax = (u/z)

	shl 	eax,26	; keep only the low 6 bits of u, moved to the top of eax
	shld 	ebx,eax,6	; ebx = v*64+u

; read 1  pixel
	mov	al,es:[ebx]	; get pixel from source bitmap

; write 1 pixel
	cmp	al,255
	je	skip6
	mov	[edi],al
skip6:	inc	edi
	
; update deltas
	add	ebp,_fx_dv_dx
	add	esi,_fx_du_dx
	add	ecx,_fx_dz_dx
	je	_div_0_abort	; would be dividing by 0, so abort

	dec	_loop_count
	jns	NotDwordAligned1_nolight
	jmp	_none_to_do

; edi is now dword aligned (unlighted path).  Split the remaining _loop_count+1 pixels into
; whole 2^NBITS-pixel spans (_loop_count) plus a remainder (num_left_over), compute the
; starting U0/V0, and pre-scale the deltas to one span.  With no whole span, fall back to
; tmap_loop (which re-tests _Lighting_on for each pixel).
DwordAligned1_nolight:
	mov	ebx,esi	; get fx_u back into ebx

	mov	eax, _loop_count
	inc	eax	; eax = pixels remaining
	mov	num_left_over, eax
	shr	eax, NBITS	; eax = number of whole spans

	test	eax, -1
	je	tmap_loop	; no 2^NBITS chunks, do divide/pixel for whole scanline
	
	mov 	_loop_count, eax	; _loop_count = pixels / NPIXS
	shl	eax, NBITS
	sub	num_left_over, eax	; num_left_over = pixels mod 2^NBITS
		
; compute initial v coordinate
	mov	eax,ebp	; get v
	PDIV
	mov	V0, eax

; compute initial u coordinate
	mov	eax,ebx	; get u
	PDIV	
	mov	U0, eax

; Set deltas to NPIXS pixel increments
	mov	eax, _fx_du_dx
	shl	eax, NBITS
	mov	DU1, eax
	mov	eax, _fx_dv_dx
	shl	eax, NBITS
	mov	DV1, eax
	mov	eax, _fx_dz_dx
	shl	eax, NBITS
	mov	DZ1, eax

	align	4
; Unlighted span loop: one pass draws 2^NBITS pixels.
; On entry ebx = u, ebp = v, ecx = z at the START of the span, U0/V0 hold u/z and v/z there,
; and edi is the destination.  The loop steps u,v,z to the END of the span, divides once for
; U1/V1, and linearly interpolates texture coordinates across the span.
; ebx/ecx/ebp are saved on the stack while they are reused and popped at the bottom.
TopOfLoop4_nolight:
	add	ebx, DU1
	add	ebp, DV1
	add	ecx, DZ1
	je	_div_0_abort	; would be dividing by 0, so abort (nothing pushed yet)

; Done with ebx, ebp, ecx until next iteration
	push	ebx
	push	ecx
	push	ebp
	push	edi
	
; Find fixed U1		
	mov	eax, ebx
	PDIV
	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1		
	mov	eax, ebp
	PDIV
	mov	ebp, eax	; ebp = V1 until pop's

	mov	ecx, U0	; ecx = U0 until pop's
	mov	edi, V0	; edi = V0 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax
		
; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0)/2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

; Save the U1 and V1 so we don't have to divide on the next iteration
	mov	U0, ebx
	mov	V0, ebp

	pop	edi	; Restore EDI before using it
		
; Each repetition fetches four texels into ecx (z is on the stack, so ecx is free) and
; then writes them either as one dword or, when _transparency_on is set, byte by byte
; skipping texels equal to 255.
    REPT (1 SHL (NBITS-2))
	local	skip7, no_trans2, skip1q, skip2q, skip3q, skip4q

; Do 1 pixel 
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
	mov	cl, es:[eax]	; load into buffer register
;;;	ror	ecx, 8	; move to next dest pixel

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	mov	ch, es:[eax]	; load into buffer register
	add	esi, edx	; inc u,v
	ror	ecx, 16	; move to next dest pixel

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	mov	cl, es:[eax]	; load into buffer register
	add	esi, edx	; inc u,v
;;;	ror	ecx, 8	; move to next dest pixel

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	mov	ch, es:[eax]	; load into buffer register
	add	esi, edx	; inc u,v
	ror	ecx, 16 ;-- can get rid of this, just write in different order below -- 	; move to next dest pixel

; ecx now holds pixels 0..3 in bytes 0..3
	test	_transparency_on,-1
	je	no_trans2
	cmp	ecx,-1	; all four texels are 255: nothing to write
	je	skip7

	cmp	cl,255
	je	skip1q
	mov	[edi],cl
skip1q:

	cmp	ch,255
	je	skip2q
	mov	1[edi],ch
skip2q:
	ror	ecx,16	; bring pixels 2 and 3 down into cl/ch

	cmp	cl,255
	je	skip3q
	mov	2[edi],cl
skip3q:


	cmp	ch,255
	je	skip4q
	mov	3[edi],ch
skip4q:

;;	wr_onepix	0
;;	wr_onepix	1
;;	wr_onepix	2
;;	wr_onepix	3
	jmp	skip7
no_trans2:
	mov 	[edi],ecx	; Draw 4 pixels to display
skip7:	add 	edi,4

    ENDM

; restore v, z, u for the next span
	pop	ebp
	pop	ecx
	pop	ebx
	dec	_loop_count
	jnz	TopOfLoop4_nolight

; All whole spans are drawn; finish if there is no remainder.
EndOfLoop4_nolight:

	test	num_left_over, -1
	je	_none_to_do

; Draw the num_left_over trailing pixels (unlighted) by stepping u, v, z one whole span
; ahead, dividing once, and interpolating from U0/V0.  Pixels are written at fixed offsets
; from edi; texel value 255 is always skipped.  Every exit is through _none_to_do.
; NOTE(review): unlike the lighted DoEndPixels, only z == 0 is checked here -- there is no
; handling for z going negative and no minimum-z clamp before the divides.  Confirm whether
; the lighted path's end-of-scanline fix should also apply here.
DoEndPixels_nolight:
	add	ebx, DU1
	add	ebp, DV1
	add	ecx, DZ1
	je	_div_0_abort
	push	edi	; use edi as a temporary variable

; Find fixed U1		
; (same instruction sequence as the PDIV macro, written out inline)
	mov	eax, ebx
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT
	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1		
	mov	eax, ebp
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT
	mov	ebp, eax	; ebp = V1 until pop's

	mov	ecx, U0	; ecx = U0 until pop's
	mov	edi, V0	; edi = V0 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax
		
; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0)/2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

	pop	edi	; Restore EDI before using it
		
	mov	ecx, num_left_over	; ecx = pixels still to draw; each pixel below decrements it

    ITERATION = 0
    REPT (1 SHL NBITS)
	local	skip8
; Do 1 pixel 
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	mov	al, es:[eax]	; load into buffer register
	add	esi, edx	; inc u,v
	cmp	al,255
	je	skip8
	mov	[edi+ITERATION], al	; write pixel
skip8:	dec	ecx
	jz	_none_to_do	; drew the last leftover pixel
    ITERATION = ITERATION + 1
    ENDM

; Should never get here!!!!!
; (the unrolled code covers 2^NBITS pixels and num_left_over is less than that)
	int	3
	jmp	_none_to_do

	

_TEXT	ends

	end

