;THE COMPUTER CODE CONTAINED HEREIN IS THE SOLE PROPERTY OF PARALLAX
;SOFTWARE CORPORATION ("PARALLAX").  PARALLAX, IN DISTRIBUTING THE CODE TO
;END-USERS, AND SUBJECT TO ALL OF THE TERMS AND CONDITIONS HEREIN, GRANTS A
;ROYALTY-FREE, PERPETUAL LICENSE TO SUCH END-USERS FOR USE BY SUCH END-USERS
;IN USING, DISPLAYING,  AND CREATING DERIVATIVE WORKS THEREOF, SO LONG AS
;SUCH USE, DISPLAY OR CREATION IS FOR NON-COMMERCIAL, ROYALTY OR REVENUE
;FREE PURPOSES.  IN NO EVENT SHALL THE END-USER USE THE COMPUTER CODE
;CONTAINED HEREIN FOR REVENUE-BEARING PURPOSES.  THE END-USER UNDERSTANDS
;AND AGREES TO THE TERMS HEREIN AND ACCEPTS THE SAME BY USE OF THIS FILE.  
;COPYRIGHT 1993-1998 PARALLAX SOFTWARE CORPORATION.  ALL RIGHTS RESERVED.
;
; $Source: f:/miner/source/texmap/rcs/tmap_16.asm $
; $Revision: 1.4 $
; $Author: mike $
; $Date: 1994/11/30 00:56:32 $
;
; 16 bits per pixel texture mapper
;
; $Log: tmap_16.asm $
; Revision 1.4  1994/11/30  00:56:32  mike
; optimization.
; 
; Revision 1.3  1994/11/12  16:39:16  mike
; jae to ja.
; 
; Revision 1.2  1993/11/22  10:23:49  mike
; *** empty log message ***
; 
; Revision 1.1  1993/09/08  17:29:17  mike
; Initial revision
; 
;
;

	.386

	public	asm_tmap_scanline_lin_16_

	include	tmap_inc.asm

_DATA	SEGMENT DWORD PUBLIC USE32 'DATA'

; Scanline parameters.  All of these are defined in another module; the
; routine in this file only reads them, except _loop_count which it writes.

	extrn	_fx_u:dword	; starting u texture coordinate of the span
	extrn	_fx_v:dword	; starting v texture coordinate of the span
	extrn	_fx_du_dx:dword	; amount added to u after every pixel
	extrn	_fx_dv_dx:dword	; amount added to v after every pixel
	extrn	_fx_y:dword	; destination row (used unshifted, i.e. as an integer)
	extrn	_fx_xleft:dword	; left end of the span, integer part in the upper 16 bits
	extrn	_fx_xright:dword	; right end of the span, integer part in the upper 16 bits

	extrn	_pixptr:dword	; address of the source texture data

	extrn	_x:dword	; declared but not referenced anywhere in this file
	extrn	_loop_count:dword	; written by the mapper: span length (pixels drawn minus one)


_DATA	ENDS

DGROUP	GROUP	_DATA

_TEXT   SEGMENT PARA PUBLIC USE32 'CODE'

	ASSUME	DS:_DATA
	ASSUME	CS:_TEXT

; --------------------------------------------------------------------------------------------------
; asm_tmap_scanline_lin_16_
;
; Draws one horizontal span of a linearly interpolated (no perspective divide)
; texture-mapped scanline into a 16 bits-per-pixel destination.
;
; Enter (all parameters are passed in globals, no register arguments are read):
;	_fx_xleft	left x coordinate, integer part in the upper 16 bits
;	_fx_xright	right x coordinate, integer part in the upper 16 bits
;	_fx_y	destination row (used as an integer, it is never shifted)
;	_pixptr	address of source pixel map, indexed as 64x64 texels of 2 bytes
;	_fx_u	initial u coordinate, bits 16..21 select the texel column
;	_fx_v	initial v coordinate, bits 16..21 select the texel row
;	_fx_du_dx	du/dx, added to u after every pixel
;	_fx_dv_dx	dv/dx, added to v after every pixel
;
;	_window_top, _window_bottom, _window_left, _window_right, _window_width,
;	_bytes_per_row, write_buffer and num_iters are not declared in this file;
;	presumably they come from tmap_inc.asm.
;
; Exit:
;	_loop_count	span length = right column - left column (pixels drawn - 1)
;	All general registers are preserved (pusha/popa).
;
; Equivalent C:
;   for (x = left; x <= right; x++) {
;      *dest++ = texture[(((v >> 16) & 63) << 6) | ((u >> 16) & 63)];
;
;      u += du_dx;
;      v += dv_dx;
;   }
;
; Clipping notes:
;	- The left column is clamped to 0.  u and v are NOT advanced for the
;	  columns that were clipped away, so a left-clipped span starts with the
;	  texture coordinates of its original left end.
;	- NOTE(review): rows above _window_top are not rejected here; confirm that
;	  callers never pass them.

	align	4
asm_tmap_scanline_lin_16_:
	pusha

; Setup for loop:	_loop_count  iterations = (int) xright - (int) xleft
;	esi	source pixel pointer = pixptr
;	edi	initial row pointer = (y - window_top)*bytes_per_row + 2*x + write_buffer

; set esi = pointer to start of texture map data
	mov	esi,_pixptr

; Reject rows past the bottom of the window.  The compare is unsigned, so a
; negative _fx_y (a very large unsigned value) is rejected as well.
	mov	edi,_fx_y
	cmp	edi,_window_bottom
	ja	_none_to_do

; set edi = address of first pixel to modify
	sub	edi,_window_top
	imul	edi,_bytes_per_row

; eax = integer left column, clamped so that it is never negative
	mov	eax,_fx_xleft
	sar	eax,16
	jns	eax_ok
	sub	eax,eax
eax_ok:
	mov	ebx,eax	; keep the clamped column: the span length must be
			; measured from the column where drawing really starts
	sub	eax,_window_left

	add	edi,eax
	add	edi,eax	; add again because it's 2 bytes/pixel
	add	edi,write_buffer

; set _loop_count = # of iterations
	mov	eax,_fx_xright
	sar	eax,16
	js	_none_to_do	; right end is left of column 0: nothing is visible.
			; (Without this test the unsigned compares below would
			; treat a negative column as larger than _window_right.)
	cmp	eax,_window_right
	jb	eax_ok1
	mov	eax,_window_right	; clamp right column to the window
eax_ok1:	cmp	eax,_window_left
	ja	eax_ok2
	mov	eax,_window_left
eax_ok2:

	sub	eax,ebx	; eax = right column - clamped left column
	js	_none_to_do	; right end is left of the left end
	cmp	eax,_window_width
	jbe	_ok_to_do
	mov	eax,_window_width
_ok_to_do:
	mov	_loop_count,eax

;	edi	destination pixel pointer

	mov	ebx,_fx_u
	mov	ecx,_fx_du_dx
	mov	edx,_fx_dv_dx
	mov	ebp,_fx_v

; Move bits 16..21 of each coordinate (the 6-bit texel index) to the top of
; the register so the inner loop can pull them out with shr/shld.  Bits above
; 21 are discarded, which makes the texture wrap every 64 texels.
	shl	ebx,10
	shl	ebp,10
	shl	edx,10
	shl	ecx,10

; eax	work
; ebx	u
; ecx	du_dx
; edx	dv_dx
; ebp	v
; esi	read address
; edi	write address

; The inner loop is unrolled num_iters times.  Jump into the unrolled code so
; that exactly _loop_count+1 copies are executed before falling out at _end1.
_size = (_end1 - _start1)/num_iters
	mov	eax,num_iters-1
	sub	eax,_loop_count
	jge	_entry_ok
	sub	eax,eax	; span is longer than the unrolled code: start at the
			; first copy instead of jumping to an address in front
			; of _start1
_entry_ok:
	imul	eax,eax,dword ptr _size
	add	eax,offset _start1
	jmp	eax

	align	4
_start1:

; "OPTIMIZATIONS" maybe not worth making
;    Getting rid of the esi from the mov al,[esi+eax] instruction.
;       This would require moving into eax at the top of the loop, rather than doing the sub eax,eax.
;       You would have to align your bitmaps so that the two shlds would create the proper base address.
;       In other words, your bitmap data would have to begin at 4096x (for 64x64 bitmaps).
;       I did timings without converting the sub to a mov eax,esi and setting esi to the proper value.
;       There was a speedup of about 1% to 1.5% without converting the sub to a mov.
;    Getting rid of the edi by doing a mov nnnn[edi],al instead of mov [edi],al.
;       The problem with this is you would have a dword offset for nnnn.  My timings indicate it is slower.  (I think.)
;    Combining u,v and du,dv into single longwords.
;       The problem with this is you then must do a 16 bit operation to extract them, and you don't have enough
;       instructions to separate a destination operand from being used by the next instruction.  It shaves out one
;       register instruction (an add reg,reg), but adds a 16 bit operation, and the setup is more complicated.
;
; inner loop if bitmaps are 256x256
; your register usage is bogus, and you must clear ecx
; fix your setup
; this is only about 10% faster in the inner loop
; this method would adapt to writing two pixels at a time better than
; the 64x64 method because you wouldn't run out of registers
; Note that this method assumes that both dv_dx and du_dx are in edx.
; edx = vi|vf|ui|uf
; where each field is 8 bits, vi = integer v coordinate, vf = fractional v coordinate, etc.
;** add ebx,edx
;** mov cl,bh
;** shld cx,bx,8
;** mov al,[esi+ecx]
;** mov [edi],al
;** inc edi
;
; usage:
;	eax	work
;	ebx	u coordinate
;	ecx	delta u
;	edx	delta v
;	ebp	v coordinate
;	esi	pointer to source bitmap
;	edi	write address
 rept num_iters
	mov	eax,ebp	; eax = v, texel row in the top 6 bits
	add	ebp,edx	; update v coordinate
	shr	eax,26	; eax = texel row (0..63), upper bits cleared
	shld	eax,ebx,6	; eax = row*64 + texel column (top 6 bits of u)
	add	ebx,ecx	; update u coordinate
	mov	ax,[esi+2*eax]	; get 16-bit pixel from source bitmap
	mov	[edi],ax	; write it to the destination
	add	edi,2	; advance one 2-byte pixel
 endm

_end1:

_none_to_do:	popa

	ret

_TEXT	ends

	end



; This is the inner loop to write two pixels at a time
; This is about 2.5% faster overall (on Mike's 66 MHz 80486 DX2, VLB)
; You must write code to even align edi and do half as many iterations, and write
; the beginning and ending extra pixels, if necessary.
;	sub	eax,eax	; clear for 
;	shld	eax,ebp,6	; shift in v coordinate
;	add	ebp,_fx_dv_dx	; update v coordinate
;	shld	eax,ebx,6	; shift in u coordinate while shifting up v coordinate
;	add	ebx,ecx	; update u coordinate
;	mov	dl,[esi+eax]	; get pixel from source bitmap
;
;	sub	eax,eax	; clear for 
;	shld	eax,ebp,6	; shift in v coordinate
;	add	ebp,_fx_dv_dx	; update v coordinate
;	shld	eax,ebx,6	; shift in u coordinate while shifting up v coordinate
;	add	ebx,ecx	; update u coordinate
;	mov	dh,[esi+eax]	; get pixel from source bitmap
;
;	mov	[edi],dx
;	add	edi,2


