; This example main DSP program shows how to handle DMA buffers
; to and from the DSP synchronously.  In "synchronous mode", input
; and output DMA transfers strictly alternate.
;
; J.O. Smith - April 1991
; 
	page 255,255,0,1,1	   	; Width, height, topmar, botmar, lmar
	opt nomd,mex,cex,mi,xr,s	; Default assembly options
	nolist				; keep the included file out of the listing
	include 'ioequ.asm'		; presumably defines the m_* I/O names used by reset - confirm
	list				; listing back on for the rest of this file
;
;------------- DMA Configuration Constants --------------------
;
; Sizes are counts of DSP words: block_loop moves one word per loop pass and
; runs DMA_READ_SIZE passes.  With $0800-word buffers the four buffers sit
; back to back from $2000 through $3FFF.  block_loop reaches them through
; y: addressing.
DMA_READ_SIZE	equ	$0800	; Words per DMA transfer into the DSP
DMA_WRITE_SIZE	equ	$0800	; Words per DMA transfer out of the DSP
READ_BUF1	equ	$2000	; First input buffer   ($2000-$27FF)
READ_BUF2	equ	$2800	; Second input buffer  ($2800-$2FFF)
WRITE_BUF1	equ	$3000	; First output buffer  ($3000-$37FF)
WRITE_BUF2	equ	$3800	; Second output buffer ($3800-$3FFF)
;
; Register assignments handed to the DMA support code.
;
; The following three register definitions MAY NOT be R0 (nor N0/M0).
; In this file R0 is scratch: set_input, set_output and block_loop all
; overwrite it.  Whether fast_dma_support.asm has a further reason for the
; restriction cannot be seen from here.
;
	define	R_DMA 'R3'	; Dedicated to DMA buffer i/o service
	define	N_DMA 'N3'	; Unused
	define	M_DMA 'M3'	; -1 (expected value; not written anywhere in this file)

; UNSYNCH selects the variant: 0 = synchronous mode (this example),
; nonzero = separate input and output DMA registers.
;
; NOTE(review): with UNSYNCH nonzero R7 becomes R_DMA_OUT, yet await_slot
; and block_loop in this file also load R7.  Resolve that clash before
; turning UNSYNCH on.
UNSYNCH set 0
	if UNSYNCH
		warn 'SYNCHED mode TURNED OFF'
		define	R_DMA_IN 'R6'	; Dedicated to DMA in
		define	N_DMA_IN 'N6'	; Unused
		define	M_DMA_IN 'M6'	; -1

		define	R_DMA_OUT 'R7'	; Dedicated to DMA out
		define	N_DMA_OUT 'N7'	; Unused
		define	M_DMA_OUT 'M7'	; -1
	endif

;-------------- Memory Configuration Constants --------------------

; Y data memory layout used by this file:
;	y:0  .. y:31	reserved for fast_dma_support (Y_DMA_TOP words)
;	y:32 .. y:95	64-word buffer area starting at Y_BUF_LOC
;	y:96 ..		the three variables declared below
;
; Nothing visible in this file reads or writes the Y_BUF_LOC area; the
; symbol only positions Y_VARIABLES.  The "DCT" wording looks inherited
; from another program - confirm before relying on it.
Y_DMA_TOP	equ 32		; max Y variables for fast_dma_support
Y_BUF_LOC 	equ Y_DMA_TOP	; place to put unpacked DCT bytes in y
Y_VARIABLES 	equ Y_BUF_LOC+64	; first free Y word after that area (= 96)

	org y:Y_VARIABLES
Y_STATUS 	dc 0			; status word
Y_BLOCK_PTR_IN	dc 0			; input buffer address for block_loop (written by set_input)
Y_BLOCK_PTR_OUT	dc 0			; output buffer address for block_loop (written by set_output)

; Status bits
B_STARTED	equ 0		; set after first buffer

	org p:0			; Reset vector (Bug56 will not accept)
	jmp >reset		; '>' forces the long absolute form of the jump

	org p:$A0		; See fast_dma_support.asm for vectors used
; UNSYNCH is 0 in this file, so SYNCHED_VERSION evaluates to 1 here.
SYNCHED_VERSION  set 1-UNSYNCH	; Configure fast_dma_support.asm for this case
	include 'fast_dma_support.asm'
; No org follows the include, so the code below is assembled at whatever
; location the included file leaves the location counter on.
; readWordHost, set_bit, dma_start, enqueue_dma_read, enqueue_dma_write and
; DEBUG_VERSION are used later but not defined in this file; presumably
; they come from the included files - confirm.

; reset - power-up entry point.  Reached through the jmp at p:0 and named
; as the start address on the "end" line.  Configures the chip, then jumps
; to main; it never returns.
reset	movec   #6,omr			; Data rom enabled, mode 2 = "normal"
	move	#0,sp			; clear stack
	bset    #0,x:m_pbc		; Enable host port
	bset	#3,x:m_pcddr		;    pc3 asserts 0 to enable external
	bclr	#3,x:m_pcd		;    DSP ram on very early machines
	movep   #>$000000,x:m_bcr	; No wait states for the external sram
        movep   #>$00B400,x:m_ipr  	; Intr levels: SSI=2, SCI=1, HOST=0
	move	#0,sr			; go to lowest int. priority level (writes 0 to the whole SR)
	jmp 	main			; enter the main program
;
;	Loop over dma buffers
;
;	The way we wait for DMA completions is to await the clearing of the
;	pending DMA request in the four-word DMA Request Queue.  The four
;	request slots are used as follows:
;
;	First pass: 	Read1,Read2,Write1,Read3
;	Later passes:	Write(N),Read(N+2),Write(N+1),Read(N+3), N=2,4,6,...
;
;	The word in the Q is written by enqueue_dma_{read,write}, and it
;	is cleared by dequeue_dma (called by the DMA-complete host command).
;	Thus, the existence of a word in the DMA Request Queue provides a
;	semaphore for determining when a DMA request is satisfied.
;
;	The blocking logic is as follows, for N=0,2,4,6,...:
;
;	Await slot 0 which contains Write(N) for N>0 or Read(1) for N=0.
;	if (N==0) {
;		request Read(2) in slot 1
;		Process Buffer(1);	request Write(1) in slot 2}
;	Await slot 1 = Read(N+2);	request Read(N+3) in slot 3
;	Process Buffer(N+2);		request Write(N+2) in slot 0
;
;	Await slot 2 = Write(N+1)
;	Await slot 3 = Read(N+3);	request Read(N+4) in slot 1
;	Process Buffer(N+3);		request Write(N+3) in slot 2
;	

;------------------------------ Utility Macros -----------------------------

; make_goto op,addr - emit the jump/call instruction "op" aimed at "addr".
;
; ARGUMENTS
;	op   = mnemonic to emit (this file passes jsr, jcc and jmp)
;	addr = target label
;
; MAKE_GOTO_SHORT picks the addressing form for every expansion:
;	0 = let the assembler choose (default; same code as before)
;	1 = force short addressing with '<', which only assembles when the
;	    target lies inside the short range of the instruction used
;
MAKE_GOTO_SHORT	set 0
make_goto macro op,addr
	if MAKE_GOTO_SHORT
	  op <addr ; must be in first 64 words for jsxxx, 4096 for jmp, etc.
	else
	  op addr
	endif
	endm

; set_input i - store the address i in y:Y_BLOCK_PTR_IN, where block_loop
; picks up its input pointer.
;
; ARGUMENTS
;	i = input buffer address (this file passes READ_BUF1 or READ_BUF2)
;
; REGISTERS CLOBBERED
;	R0
;
set_input macro i
	move #i,R0
	move R0,y:Y_BLOCK_PTR_IN	; block_loop input pointer
	endm

; set_output o - store the address o in y:Y_BLOCK_PTR_OUT, where block_loop
; picks up its output pointer.
;
; ARGUMENTS
;	o = output buffer address (this file passes WRITE_BUF1 or WRITE_BUF2)
;
; REGISTERS CLOBBERED
;	R0
;
set_output macro o
	move #o,R0
	move R0,y:Y_BLOCK_PTR_OUT	; block_loop output pointer
	endm

; await_slot s - block until DMA request queue slot s has been cleared.
;
; The slot number is loaded into R7 unchanged and await_zero_y then polls
; y:(R7), so as used here slot s is the word at Y address s (slots 0-3 are
; y:0 through y:3).
;
; ARGUMENTS
;	s = slot number (this file passes 0, 1, 2 or 3)
;
; REGISTERS CLOBBERED
;	R7, and A (inside await_zero_y)
;
await_slot macro s
	move #s,R7
	make_goto jsr,await_zero_y	; wait for word in slot s to be 0
	endm

;----------------------------- Main Loop -----------------------------------

; main - read the scale factor from the host, start the first DMA read,
; then process buffers forever, alternating between the two buffer pairs.
; Never returns.
;
; Y0 holds the scale factor for the whole run and is read by block_loop.
; NOTE(review): this relies on the DMA support routines and interrupt
; service leaving Y0 alone - not verifiable from this file.
;
; Each trip round main_loop has two stages:
;   stage A (skipped on the very first trip): buffer pair 2
;   stage B (label "short"):                  buffer pair 1
;
main
	readWordHost Y0		; read scale factor (programmed i/o)
	if DEBUG_VERSION
	  ; debug builds require the host to send exactly 0.5
	  move #0.5,A 		; 1/2 in DSP number format
	  cmp Y0,A
	  jsne abort		; any other value: signal abort and hang
	endif
	make_goto jsr,dma_start		; start first DMA read

main_loop
	await_slot 0			; wait for Read(1) on the first trip, Write(N) later
	; bset leaves the previous value of the bit in the carry flag, and the
	; jcc below consumes it: keep the two instructions adjacent.
	bset #B_STARTED,y:Y_STATUS	; on first iteration, bit is 0
	make_goto jcc,short		; first trip only: skip stage A
	await_slot 1			; wait for Read(N+2)

        make_goto jsr,enqueue_dma_read	; enqueue Read(N+3) in slot 3

	set_input READ_BUF2
	set_output WRITE_BUF2
	make_goto jsr,block_loop	; scale Buffer(N+2); queues Write(N+2) in slot 0

	await_slot 2			; wait for Write(N+1)
	await_slot 3			; wait for Read(N+3)

short   make_goto jsr,enqueue_dma_read	; start next DMA read (Slot 1)

	set_input READ_BUF1
	set_output WRITE_BUF1
	make_goto jsr,block_loop	; Do Buffer(N+3) (Buffer(1) on first trip); its Write goes to Slot 2

 	make_goto jmp,main_loop		; loop forever

;----------------------------- Block Loop ----------------------------------
; block_loop - multiply every word of one input buffer by the scale factor
;	       into one output buffer, then queue a DMA write.
;
; ARGUMENTS
;	y:Y_BLOCK_PTR_IN  = address of the input buffer in Y memory
;	y:Y_BLOCK_PTR_OUT = address of the output buffer in Y memory
;	Y0		  = scale factor
;
; REGISTERS CLOBBERED
;	R0, R7, Y1, A, plus whatever enqueue_dma_write touches
;
; The loop stores DMA_READ_SIZE words, while the output side is laid out by
; DMA_WRITE_SIZE (the outgoing transfer length is presumably taken from it
; by the DMA support code).  If the two ever differed, the loop would
; either run past the output buffer or leave part of it stale.  Refuse to
; assemble in that case rather than fail silently at run time.
	if DMA_READ_SIZE!=DMA_WRITE_SIZE
	  fail 'block_loop needs DMA_READ_SIZE equal to DMA_WRITE_SIZE'
	endif
block_loop
	move y:Y_BLOCK_PTR_IN,R0	; input address
	move y:Y_BLOCK_PTR_OUT,R7	; output address
 	do #DMA_READ_SIZE,ltbloop	; hardware loop, one pass per word
	    move y:(R0)+,Y1		; input word
	    mpy Y0,Y1,A			; apply scale factor
	    move A,y:(R7)+		; output word
ltbloop					; loop end: first address after the loop body
 	make_goto jsr,enqueue_dma_write	; start new DMA write
	rts
;---------------------------------------------------------------------------
	if DEBUG_VERSION
; abort - debug builds only.  Called from main when the scale factor
; received from the host is not 0.5.  Raises both host flags as the abort
; code and then spins forever; it never returns to its caller.
; set_bit is not defined in this file (presumably a macro from an include).
abort	  set_bit	hf2,hcr	; abort code = HF2 and HF3
	  set_bit	hf3,hcr	; abort code = HF2 and HF3
	  jmp *			; hang here: jump to self
	endif
;---------------------------------------------------------------------------
; await_zero_y - block until y:(R7) == 0
;
; Busy-waits, re-reading the word on every pass.  Nothing in this routine
; changes the word, so it only returns once something else (per the queue
; description in this file, the DMA-complete service) has cleared it.
; There is no timeout.
;
; ARGUMENTS
;	R7 = address of word in Y memory to await going to zero
;
; REGISTERS CLOBBERED
;	A
;
await_zero_y
	move y:(R7),A		; fetch the current value of the word
	tst A			; set condition codes from it
	jne await_zero_y	; still nonzero: poll again
	rts
;---------------------------------------------------------------------------

	if *>$200		; '*' is the current location counter
	  fail 'On-chip program memory overflow' ; best to check
	endif			; code may end at $200 exactly, i.e. last word at $1FF

	end reset		; end of source; reset is the program start address
