; fast_dma_support.asm - Time-efficient DSP DMA read and write support.
;
; Author: J. O. Smith (jos@next.com)
; NeXT Computer Inc.
; January 1991
; Last updated 7/24/91.
;
 if 0 ;; ************************* BEGIN LONG COMMENT ********************

This DSP "include" file provides DSP subroutines which support streaming
input/output, such as for audio or video data, into and out of the DSP.


HOW TO USE THIS FILE

Study the example main program below.  Special notes:

The file ioequ.asm, Motorola's standard "I/O equates" file, must be
included before this one.

There are some DMA configuration constants that must be defined in the
including DSP program.  See the list below.

The first few locations in on-chip Y RAM are used for DMA control.

Three index registers are reserved for I/O service.  This can be reduced
to one when necessary.


EXPORTED ROUTINES

This file defines six I/O macros for the including program to invoke:

	readWordHost  - read next word from host (programmed I/O)
	writeWordHost - write next word to host (programmed I/O)

The above two utilities support word-at-a-time i/o.  The corresponding C
functions are snddriver_dsp_write() and snddriver_dsp_read(), respectively.
DMA is not used for these reads and writes.  Any data mode can be used by
the host (8-bit, 16-bit, or 24-bit modes).  The following utilities provide
support for DMA transfers:

	readWordDMA   - read next word from input DMA stream (host to DSP)
	writeWordDMA  - write next word to output DMA stream (DSP to host)

	readShortDMA  - read next short from input DMA stream (host to DSP)
	writeShortDMA - write next short to output DMA stream (DSP to host)

Words are supplied from and written to the streaming I/O DMA buffers in DSP
memory.  When an input buffer is empty, or an output buffer is full, the
DSP initiates a DMA buffer transfer by writing a command word to the host.
Two buffers are used in each direction so that the DSP can go on reading
and writing internally while the host starts up and completes the requested
DMA transfer.

See synched_buffers_example.asm for a more efficient substitute for the
above DMA routines in the "synchronous case" where input and output streams
are synchronously related.  In the synchronous case, the DSP program
initiates each DMA transfer at the appropriate point in the processing,
avoiding the overhead of an I/O call for each word and (perhaps most
importantly) freeing up two of the three reserved index registers.
Moreover, the amount of program memory used for I/O is reduced.

The subroutine 

	dma_start   - initialize DMA buffer pointers and status bits

should be called (via jsr) before invoking the I/O macros.  This routine
requests the first transfer into the DSP.  The first read by the DSP
program blocks until that transfer has completed, and it also requests the
second DMA read.  (In the Bug56 non-blocking build, dma_start carries out
the first transfer itself before returning.)  The first DMA write is
requested when the first output buffer is filled, and so on.


CONFIGURATION CONSTANTS

The including program must define configuration constants such as in the
following example.  The example below uses all of external memory for the
input and output buffers.

;---------------------- Begin DMA Configuration Constants --------------------
;
DMA_READ_SIZE	equ	$0800	; Size of each DMA transfer in 
DMA_WRITE_SIZE	equ	$0800	; Size of each DMA transfer out
READ_BUF1	equ	$2000	; First input buffer
READ_BUF2	equ	$2800	; Second input buffer
WRITE_BUF1	equ	$3000	; First output buffer
WRITE_BUF2	equ	$3800	; Second output buffer

;
; The following register definitions MAY NOT be R0
;
	define	R_DMA 'R7'	; Dedicated to DMA buffer i/o service
	define	N_DMA 'N7'	; Unused
	define	M_DMA 'M7'	; -1

	define	R_DMA_IN 'R5'	; Input sample pointer (for data from host)
	define	N_DMA_IN 'N5'	; Unused
	define	M_DMA_IN 'M5'	; 2*DMA_READ_SIZE-1

	define	R_DMA_OUT 'R6'	; Output sample pointer (for data to host)
	define	N_DMA_OUT 'N6'	; Unused
	define	M_DMA_OUT 'M6'	; 2*DMA_WRITE_SIZE-1

Note that R_DMA_IN and R_DMA_OUT are only used by readWordDMA and
writeWordDMA, respectively.  If these macros are not used, only one index
register (R_DMA) need be dedicated to I/O service.  Take a look at these
macros to see how to get around using them.  The basic idea is to call
enqueue_dma_read and enqueue_dma_write yourself, (explicitly in your DSP
program), and block yourself to await the filling of the previously
requested read or write buffer when necessary.  This approach not only
saves index registers, but it should be faster because you can process a
whole buffer in a DO loop, using auto-incrementing indices, rather than
fetching and putting each word one at a time from/to the current DMA
buffers.  The four slots in the DMA request queue can be used as
semaphores: They are written by enqueue_dma and cleared by dequeue_dma (on
completion).  See synched_buffers_example.asm in this directory for an
example of this approach.

BUG56_VERSION 	set 0		; 1 if loading into Bug56, 0 to try for real
DEBUG_VERSION 	set 0		; 1 to emit extra error-checking code
;
;----------------------- End DMA Configuration Constants ---------------------


MAIN PROGRAM EXAMPLE
; ---------------------------------------------------------------------------
; Example program which reads a shift count and applies it to the data stream
; See dsp_dma_stream.asm for what could suffice for "basic_reset.asm".
;
	include 'ioequ.asm'	  		  ; standard equates
	include 'dma_configuration_constants.asm' ; configure DMA as above
	define START_ADDRESS '$30'	; Make room for interrupt vectors
	org p:0				; Reset vector (Bug56 will not accept)
reset	jmp start

	org p:START_ADDRESS
	include 'fast_dma_support.asm'	; this file
start	include 'basic_reset.asm'	; Set up omr, sp, bcr, sr, ipr, etc.
	readWordHost N0		; get a shift count via programmed I/O
	jsr dma_start		; initialize DMA and start first read to DSP
loop	readWordDMA A		; get next input sample
	rep N0			; do the shift
	  lsl A			;   by desired number of bits
	writeWordDMA A		; output word
	jmp loop		; and loop back for next
	end reset
; ---------------------------------------------------------------------------


PERFORMANCE

Using the "xpr" tracing facility in a debug kernel, this example has been
measured to transfer 1.4 megabytes per second, sustained, on a NeXTstation.
That's 700 kBytes/Sec in each direction.  This is comfortably faster than
the SCSI disk can go.  The DMA size in this case is 4096 bytes per transfer
in 16-bit mode.  Each burst is going at about 2.6 megabytes/sec, with the
remaining time going to stream management in the driver.  Less than half of
the CPU is consumed at this rate.  Doubling the DSP buffer size cuts the
CPU loading to 1/3, quadrupling gives 1/5, etc.  These numbers are quite
pessimistic due to xpr tracing.  It is known that the xpr tracing slows CPU
performance drastically.


OUTLINE OF PROTOCOL BETWEEN DSP AND HOST

Host boots DSP main program which includes this code.
DSP program calls readWordHost to receive any initial parameters.
After last parameter is written, DSP program calls dma_start to start DMA.
DSP requests each DMA transfer.
Host terminates each DMA transfer with host command.
One extra buffer must be sent by host to flush out the last buffer.

The HC bit will always be clear when the host needs to write a host command
after a DMA completes, so you never have to worry about blocking on that.

If the DSP needs more time, it will hang the DMA flow, whether it's input
or output.  The host need not think about flow control.

At the moment there is no provision for stopping the data flow once it is
started.  This would be easy to add. (See simple_dma_support.asm.)


USE WITH BUG56

In the main DSP program or Makefile, set BUG56_VERSION to something
nonzero.  Load the assembled .lod file in Bug56, and start stepping or
running.  Basically, things work as in real life except that HRIE and HTIE
are not set to enable DMA flow.  Instead, the read or write is manually
carried out at that point via the subroutine do_dma_read or do_dma_write
(unless the BUG56_BLOCK flag is set in which case the DSP awaits I/O as
usual).

Manually doing the read means you must write the host port by hand using
Bug56's "Write Host Port" menu item.  In such a case, it is good to have
set the DMA buffer sizes to a small number for debugging.  A write appears
in the Bug56 "Host Port Log".

On DSP debugging in general, note that the status bits at y:Y_DMA_STATUS
(as well as all other DSP memory) persist even after a DSP reset.  A common
debugging technique is to assemble with the DEBUG_VERSION flag set (to get
extra status bit support and to leave room for the Bug56 monitor), use the
program normally, (i.e., not under Bug56), then at some point kill the
program, launch Bug56 (or unhide it), "load symbols only" from the .lod
file, and inspect the status bits (and whatever else).  In some situations
it is effective to place the statement "halt jmp halt" at a strategic point
in the DSP program, use it normally, wait until it hangs, then kill the
program and step in with Bug56.

 endif ;; ************************** END LONG COMMENT *********************

	if !@def(BUG56_VERSION)	; each flag below gets a default only if not already defined
BUG56_VERSION set 0		; 1 to get version to load into Bug56
	endif

	if !@def(BUG56_BLOCK)	; Set 0 for programmed I/O, 1 for DMA sim
BUG56_BLOCK set 1		; 1 if blocking on I/O in Bug56 version
	endif

	if BUG56_VERSION
	  msg 'BUG56 version'	; announce the build flavor at assembly time
;*FIXME*sndconvert bug prohibits internal comments*  cobj 'BUG56 version'
	endif

	if !@def(DEBUG_VERSION)
DEBUG_VERSION	  set 0			; 1 to get extra error checking
	endif

	if DEBUG_VERSION
	  msg 'DEBUG version'
;*FIXME*sndconvert bug prohibits internal comments* cobj 'DEBUG version'
	endif

	if !@def(VIDMAIL_VERSION)
VIDMAIL_VERSION set 0		; special protocol used by vidmail dsp driver
	endif

; VIDMAIL_VERSION forces SYNCHED_VERSION on.  This must stay ahead of the
; SYNCHED_VERSION default below, which only applies when it is still undefined.
	if VIDMAIL_VERSION
	  msg 'VIDMAIL version'
;*FIXME*sndconvert bug prohibits internal comments* cobj 'VIDMAIL version'
SYNCHED_VERSION set 1		; required for VIDMAIL version
	endif

;; The SYNCHED version only supports a synchronous data
;; protocol in which DMA reads and writes strictly
;; alternate.  There is no API for getting messages in and out of the DSP
;; between data buffers, nor is there a way to abort other than by resetting
;; the DSP.  As a result, we get maximally simple and efficient i/o.
;; In this file, SYNCHED_VERSION removes the per-word DMA macros
;; (readWordDMA, writeWordDMA, ...) and their blocking state variables.

	if !@def(SYNCHED_VERSION)
SYNCHED_VERSION set 0
	endif

	if SYNCHED_VERSION
	  msg 'SYNCHED version'
;*FIXME*sndconvert bug prohibits internal comments* cobj 'SYNCHED version'
	endif


; Remember the current location counter so that program assembly can resume
; there ("org p:dma_s_saved_lc") once the y-memory variables and interrupt
; vectors below have been laid down.
dma_s_saved_lc	set *

;------------------------- Y memory locations used ----------------------

	org y:0

; Four-slot DMA request queue.  A slot holds a DMA descriptor
; (direction,,address) while its request is pending; dequeue_dma writes 0
; back into the slot when the transfer completes.  The queue is indexed
; with modulo-4 addressing (M0 = 3) by enqueue_dma and dequeue_dma.
Y_DMAQ_A0	dc 0	; Enqueued DMA request
Y_DMAQ_A1	dc 0	; Enqueued DMA request
Y_DMAQ_A2	dc 0	; Enqueued DMA request
Y_DMAQ_A3	dc 0	; Enqueued DMA request

; The status word cannot be moved higher than the first 64 words.
; (Instructions like btst require a short absolute address.)
Y_DMA_STATUS	dc 0	; On-chip Y memory word used for status bits
Y_DMAQ_FREE	dc 0	; Pointer to first empty slot in DMA Q
Y_DMAQ_NEXT	dc 0	; Pointer to first nonempty slot in DMA Q or 0
Y_DMA_ARG	dc 0	; DMA descriptor argument
Y_SYSCALL	dc 0	; On-chip Y memory word used for syscall arg

; DMA state variables

Y_READ_TRIGGER		dc 0	; ptr value at which dma request goes out
Y_WRITE_TRIGGER		dc 0	; ptr value at which dma request goes out
; In SYNCHED_VERSION, the read and write triggers are used instead to keep
; track of which buffer is which.

	if !SYNCHED_VERSION
Y_LAST_READ_ADDRESS	dc 0	; ptr to last ready element of read buffers
Y_LAST_WRITE_ADDRESS 	dc 0	; ptr to last empty element of write buffers
	endif

; DEBUG builds keep a log of DMA requests and completions.  It starts at
; Y_DMAQ_LOG and wraps at Y_DMA_TOP (see wrap_log_ptr).
	if DEBUG_VERSION
Y_DMAQ_LOG_PTR		dc *+1	; initially points at Y_DMAQ_LOG, the next word
Y_DMAQ_LOG		dc 0
;	...
	endif

; If the including program defined Y_DMA_TOP, check that the variables above
; fit below it; otherwise reserve 32 more words above them.  dma_start
; clears everything from Y_DMAQ_A0 through Y_DMA_TOP.
Y_DMA_TOP_REAL		equ *	; Pointer to top of Y memory used.
	if @def(Y_DMA_TOP)
	  if Y_DMA_TOP_REAL>Y_DMA_TOP
	    fail 'Too many on-chip Y variables used by fast_dma_support.asm'
	  endif
	else
Y_DMA_TOP equ Y_DMA_TOP_REAL+32	; Allocate DMA request and DQ log
	endif

;------------------------- Bit fields in the status word ----------------------
;
B_DMA_ACTIVE 	equ 0	  	;  $1 - set when DMA is active
B_SYS_CALL	equ 1 		;  $2 - indicates sys call has been received
; NOTE: B_ABORTING, B_LAST_W_BUF and B_IDLE are defined here but are not
; referenced anywhere else in this file.
B_ABORTING	equ 2		;  $4 - indicates abort host command received
B_LAST_W_BUF	equ 3		;  $8 - set after abort during last input buf
B_IDLE   	equ 4		; $10 - abort complete (looked at using Bug56)
; The following bits exist only in DEBUG builds; every use of them in this
; file is inside an "if DEBUG_VERSION" conditional.
	if DEBUG_VERSION
B_REGS_SAVED 	equ 5 		; $20 - set when regs saved (DEBUG_VERSION)
B_READ_BLOCK 	equ 6 		; $40 - set when blocking until READ complete
B_WRITE_BLOCK 	equ 7 		; $80 - set when blocking until WRITE complete
B_READ_FLOWING	equ 8		;$100 - set when READ is actively flowing
B_WRITE_FLOWING	equ 9		;$200 - set when WRITE is actively flowing
B_AWAIT_NOT_HF1	equ 10		;$400 - set when waiting for hf1 to go low
B_AWAIT_HF1	equ 11		;$800 - set when waiting for hf1 to go high
	endif
B_ERROR		equ 23		;$800000 - set on internal abort due to err (by dma_error)

; ---- One-instruction helpers operating on the status word y:Y_DMA_STATUS ----
; <flag> is one of the B_xxx bit numbers defined above.

set_status macro flag			; turn status bit <flag> on
	bset #flag,y:Y_DMA_STATUS
	endm

clear_status macro flag			; turn status bit <flag> off
	bclr #flag,y:Y_DMA_STATUS
	endm

test_status macro flag			; test status bit <flag> with btst
	btst #flag,y:Y_DMA_STATUS
	endm

jsset_status macro flag,target		; jsr to <target> if status bit <flag> is set
	jsset #flag,y:Y_DMA_STATUS,target
	endm

jsclr_status macro flag,target		; jsr to <target> if status bit <flag> is clear
	jsclr #flag,y:Y_DMA_STATUS,target
	endm

; ---- Helpers for bits of on-chip I/O registers ----
; Both arguments are given WITHOUT the "m_" prefix used by ioequ.asm;
; e.g. "set_bit hcie,hcr" expands to "bset #m_hcie,x:m_hcr".

set_bit macro name,ioreg
	bset #m_\name,x:m_\ioreg
	endm

clear_bit macro name,ioreg
	bclr #m_\name,x:m_\ioreg
	endm

; asm_goto - assemble short absolute jump address in common-denominator case.
; usage: asm_goto <JMP,Jxx,JSR,JSxx>,label
;  6-bit Absolute Short addresses: jclr,jsclr,jset,jsset
; 12-bit Absolute Short addresses: jxx,jmp,jsxx,jsr
; The DSP assembler should figure this out!
;
; Note that the test is made on "*" (the location counter where the jump is
; being assembled), not on the target address: the short form "<addr" is
; forced only when the jump itself is being assembled below word 64.
asm_goto macro op,addr
	if *<64
	  op <addr ; must be in first 64 words for JScc, 4K for JMP, Jcc etc.
	else
	  op addr
	endif
	endm

; asm_lgoto - (long version) assemble short absolute jump address in 4K case
; usage: asm_lgoto <JMP,Jxx,JSR,JSxx>,label
;
; Same idea as asm_goto, with a 4096-word threshold: the short form is forced
; while the jump is being assembled below p:4096, otherwise the assembler is
; left to choose.  As above, the test is on "*", not on the target.
asm_lgoto macro op,addr
	if *<4096
	  op <addr
	else
	  op addr
	endif
	endm

;
;------------------------------- Message codes ----------------------------
;
SC_W_REQ	equ	$020002	  ;"Sys call" requesting DMA write on chan 2
				  ; (only compared against in DEBUG builds, in sys_call)
DM_R_REQ	equ	$050001	  ;"DSP message" requesting DMA read on chan 1
				  ; (sent by next_dma to start a DSP-to-host transfer)
DM_W_REQ	equ	$040002	  ;message requesting DMA write on channel 2
				  ; (sent by next_dma to start a host-to-DSP transfer)
;
;------------------------------- Interrupt vectors ----------------------------
;
; Program-memory addresses of the two-word interrupt slots filled in below.
;
VEC_HOST_RCV	equ	$0020	  ;host receive interrupt vector
VEC_HOST_XMT	equ	$0022	  ;host transmit interrupt vector
VEC_W_DONE	equ	$0024	  ;host command saying dma from dsp complete
VEC_R_DONE	equ	$0028	  ;host command saying dma to dsp complete
VEC_SYS_CALL	equ	$002C	  ;host command indicating sys-call int coming
VEC_ABORT 	equ 	$002E	  ;host command indicating external abort

	org	p:VEC_HOST_RCV	; fast interrupt: one word arriving from the host
iv_hr	movep 	x:m_hrx,y:(R_DMA)+	; store host word at y:(R_DMA), advance R_DMA
	nop

	org	p:VEC_HOST_XMT	; fast interrupt: host ready for one more word
iv_hx	movep 	y:(R_DMA)+,x:m_htx	; send word at y:(R_DMA) to host, advance R_DMA
	nop

	org	p:VEC_R_DONE	; host command: DMA into the DSP has finished
iv_rc	jsr	>dma_read_complete

	org	p:VEC_W_DONE	; host command: DMA out of the DSP has finished
iv_wc	jsr	>dma_write_complete

	if !VIDMAIL_VERSION	; the vidmail protocol installs neither of these

	org	p:VEC_SYS_CALL
	jsr	>sys_call

	org	p:VEC_ABORT
	jsr	>dma_error		; Not supported in this example

	endif

	org	p:dma_s_saved_lc	; resume assembling where the includer left off

	if !VIDMAIL_VERSION
;---------------------------------------------------------------------------
; sys_call - field a request from the kernel
;
; A "system call" is a host command followed by one int written to the DSP.
; In the future, the int may specify more ints to follow.
; All currently possible syscall bits are listed in 
; /usr/include/nextdev/snd_dsp.h (search for SYSCALL).
;
; arg = 24bits = (8,,16) = (op,datum)
; 	where op = 1 for read and 2 for write
;	and datum is currently not used.
;
; Entered from the VEC_SYS_CALL host-command vector; returns with rti.
; The handler spins until the argument word arrives, records it in
; y:Y_SYSCALL, sets B_SYS_CALL, and then enables host-receive interrupts so
; that the iv_hr fast interrupt starts moving host words into y:(R_DMA)+.
; No registers are used outside DEBUG builds; DEBUG builds bracket their
; extra checks with save_regs/restore_regs.
;
sys_call
	jclr #m_hrdf,x:m_hsr,sys_call		;buzz until int received
	movep x:m_hrx,y:Y_SYSCALL		;int specifying operation
	set_status B_SYS_CALL			;set flag to say we got this
	if DEBUG_VERSION
	  jsr 	save_regs
	  move #>SC_W_REQ,X0
	  move	y:Y_SYSCALL,A 			;int specifying operation
	  cmp	X0,A
	  jsne	dma_error			;only SC_W_REQ is expected here
	  test_status B_READ_FLOWING		;btst leaves the bit in carry
	  jscs	dma_error			;error if a read is already flowing
	  jsr restore_regs
	endif
	if !BUG56_VERSION
	  set_bit hrie,hcr 			; enable dma input flow (host to DSP)
	endif
	if DEBUG_VERSION
	  set_status B_READ_FLOWING
	endif
	rti

	endif		; !VIDMAIL_VERSION

;---------------------------------------------------------------------------
; Simple host-interface i/o (programmed I/O; no DMA involved)
;
; "writeWordHost source" writes word in source to the host interface.
; "readWordHost dest" reads word in host interface to dest.
; These can only be used at the BEGINNING of the DSP program.
; After DMA transfers start, they cannot be used again.
;
; Unless BUG56_VERSION is set, each macro first spins on the relevant host
; status flag (HTDE before a write, HRDF before a read); "*" as the jump
; target makes the jclr branch to itself.  In the Bug56 build the wait is
; omitted and the transfer is done immediately.
;
; writeWordHost names the host TRANSMIT register (m_htx), as every other
; transmit in this file does.  In Motorola's ioequ.asm HTX and HRX are
; equates for the same I/O address, so the generated code is the same as
; with the m_hrx spelling used previously.
;
writeWordHost macro source
	if !BUG56_VERSION
	  jclr #m_htde,x:m_hsr,*	; can't force short
	endif
        movep source,x:m_htx		; hand the word to the host
	endm	

readWordHost macro dest
	if !BUG56_VERSION
	  jclr #m_hrdf,x:m_hsr,*	; can't force short
	endif
        movep x:m_hrx,dest		; fetch the word the host wrote
	endm	

; DMA end-pointers
;
; Each buffer pair is addressed modulo twice the buffer size (see M_DMA_IN
; and M_DMA_OUT in dma_start), so the word after the second buffer is the
; first word of the first buffer.
;
READ_END1	equ	READ_BUF2  ; End of first input buffer + 1
READ_END2	equ	READ_BUF1  ; End of second input buffer + 1 (modulo)
WRITE_END1	equ	WRITE_BUF2 ; End of first output buffer + 1
WRITE_END2	equ	WRITE_BUF1 ; End of 2nd out buf + 1 (modulo indexing)

; check_reg r - fail the assembly if register name r is R0 (either case).
; R0 is used as scratch by the DMA queue code in this file, so it cannot
; also serve as one of the dedicated DMA pointers.
check_reg macro r
	  if "r"=='R0'||"r"=='r0'
	    fail 'fast_dma_support.asm: Cannot use R0 for DMA reg'
	  endif
	endm

	check_reg R_DMA
	if !SYNCHED_VERSION	; R_DMA_IN/R_DMA_OUT are only used outside SYNCHED mode
	  check_reg R_DMA_IN
	  check_reg R_DMA_OUT
	endif
;
; dma_start - executed when DSP boots up. Resets DMA and starts first read.
;
; Call with jsr.  Steps, in order:
;   1. enable host-command interrupts (HCIE);
;   2. set the modifier registers of the dedicated DMA index registers;
;   3. zero y memory from Y_DMAQ_A0 through Y_DMA_TOP (queue, status word,
;      state variables, log area);
;   4. enqueue the first host-to-DSP DMA request, targeting READ_BUF1;
;   5. unless SYNCHED_VERSION, initialise the user-level read/write pointers,
;      the write trigger and the blocking addresses.
; In the normal build the routine returns as soon as the first request has
; been issued; the first getWordDMA call is what waits for the data.
; Clobbers R0, X0 and A (plus N0 in DEBUG builds) and writes M_DMA,
; M_DMA_IN, M_DMA_OUT, R_DMA_IN and R_DMA_OUT.
;
dma_start 
dma_reset					; synonym
	set_bit hcie,hcr 			; enable host commands
	if DMA_READ_SIZE==DMA_WRITE_SIZE
	  move #DMA_READ_SIZE-1,M_DMA		; Modulo addressing by default
	else
	  move #$FFFF,M_DMA			; Linear addressing by default
	endif
	if !SYNCHED_VERSION
	  move #(2*DMA_READ_SIZE-1),M_DMA_IN	; Modulo addressing for input
	  move #(2*DMA_WRITE_SIZE-1),M_DMA_OUT	; Modulo addressing for output
	endif
	move #0,X0
	move #Y_DMAQ_A0,R0
	rep #(Y_DMA_TOP-Y_DMAQ_A0+1)	; Y_DMAQ_A0 .. Y_DMA_TOP inclusive
	  move X0,y:(R0)+		; Clear DMA request Q, status, etc.
	if DEBUG_VERSION
	  ; Zeroes 2*(DMA_READ_SIZE+DMA_WRITE_SIZE) consecutive y words from
	  ; READ_BUF1 upward.  That covers all four buffers only when they are
	  ; laid out back to back in that order, as in the example
	  ; configuration at the top of this file.
	  move #READ_BUF1,R0
	  move #2*(DMA_READ_SIZE+DMA_WRITE_SIZE),N0
	  rep N0
	    move X0,y:(R0)+		; Clear DMA buffers
	  move #Y_DMAQ_LOG,R0
	  move R0,y:Y_DMAQ_LOG_PTR	; Initialize DMA request and DQ log
	endif
	;
	; Initialize DMA state variables.
	; Start first DMA read (the wait for it happens in getWordDMA).
	;
	; enqueue_dma_read toggles the trigger before using it, so seeding
	; the trigger with READ_BUF2 makes the first request fill READ_BUF1.
	;
	move #READ_BUF2,R0		; trigger is always one buffer ahead
	move R0,y:Y_READ_TRIGGER	; init for enqueue_dma_read's sake
	if !SYNCHED_VERSION
	  move #READ_END2,X0		; actually beginning of read buffers
	  move X0,y:Y_LAST_READ_ADDRESS	; init for dma_read_complete's sake
	endif
	asm_lgoto jsr,enqueue_dma_read 	; prime input pipe
	;
	; Set up pointers for user i/o code
	;
	if !SYNCHED_VERSION
	  move #READ_BUF1,R_DMA_IN    	; init input pointer for first read
	  move #WRITE_BUF1,R_DMA_OUT    ; init output pointer for first write
	  move #WRITE_BUF2,X0		; ptr value when wbuf1 full
	  move X0,y:Y_WRITE_TRIGGER	;  installed as trigger
	  move #WRITE_END2,X0	  	; first buffer element we can't write
	  move X0,y:Y_LAST_WRITE_ADDRESS ;  installed for proper blocking
	endif
	if BUG56_VERSION
	  jsr do_dma_read ; Simulate host getting around to one DMA write
	endif
	rts

	if !SYNCHED_VERSION

; readWordDMA dest - returns the next sample from input stream in register A1
; and if dest!=A, moves A to dest.
; When input buffer is empty, blocks until DMA is successful. 
; Must be called once for each sample in a sample-frame.  
;
; NOTE(review): earlier versions of this comment said that carry is set on
; return when there is no more input.  getWordDMA contains no end-of-input
; test; it simply blocks until data arrives, and the flags on return are
; whatever its last instructions left.  Do not rely on carry.
;
; Registers A, B and X0 are modified.
; Assumes R_DMA_IN is initially set to point to input buffer.
;
; The macro expands to a subroutine call to getWordDMA, followed by a
; register move when dest is not literally 'A' (a string comparison made at
; assembly time).
;
readWordDMA macro dest
	asm_lgoto jsr,getWordDMA
	if "dest"!='A'
	  move A,dest
	endif
	endm

; readShortDMA dest - returns the next short from input stream in register A1
; and if dest!=A, moves A to dest. Assumes 16-bit DMA mode is in use
; which means the word needs to be left-shifted 8 bits in order to
; left-justify it.  (16-bit DMA uses RXM and RXL in the host interface.)
; Registers A, B, X0, and Y0 are modified.
; Otherwise readShortDMA is identical to readWordDMA.
;
; The shift is done with a multiply: the word is multiplied by 2^-16 and
; the low product word A0 is then moved into A.
;
readShortDMA macro dest
	asm_lgoto jsr,getWordDMA
	move A,X0 #>@pow(2,-16),Y0	; X0 = word just read, Y0 = scale factor
	mpy X0,Y0,A			; 48-bit product in A1:A0
	move A0,A			; keep the low product word (shifted result)
	if "dest"!='A'
	  move A,dest
	endif
	endm

; getWordDMA - subroutine behind readWordDMA and readShortDMA.
;
; Returns the word at y:(R_DMA_IN) in A and advances R_DMA_IN.
; Before reading, it waits while R_DMA_IN equals y:Y_LAST_READ_ADDRESS,
; i.e. while the next word belongs to a buffer whose DMA has not completed
; (dma_read_complete moves Y_LAST_READ_ADDRESS on).  When R_DMA_IN equals
; y:Y_READ_TRIGGER, the DMA refill of the other buffer is requested first.
;
; Clobbers A and X0.  If next_dma has to be invoked from the wait loop it
; also overwrites R0 and M0.
;
getWordDMA
rwd_block
	; A is reloaded on every pass of the wait loop: next_dma (possibly
	; called below) overwrites A, and a stale A would make the compare
	; fail and release the wait before the buffer has been filled.
	move R_DMA_IN,A			; address of next word to read
	move y:Y_LAST_READ_ADDRESS,X0	; first unavailable word
	cmp X0,A
	jne rwd_unblock
	jsclr_status B_DMA_ACTIVE,next_dma ; let any Q'd dma requests out
	if DEBUG_VERSION
	  set_status B_READ_BLOCK
	endif
	if BUG56_VERSION&&(!BUG56_BLOCK)
	  ; Blocking for a read... Clear it now
	  jsr do_dma_read ; Simulate host getting around to a DMA write
	else
	  asm_lgoto jmp,rwd_block ; block until dma_read_complete is called
	endif
rwd_unblock
	if DEBUG_VERSION
	  clear_status B_READ_BLOCK
	endif
	move y:Y_READ_TRIGGER,X0
	move R_DMA_IN,A
	cmp X0,A
	jseq enqueue_dma_read	; we depend on delay before host starts write:
	move y:(R_DMA_IN)+,A	; requested read (last wd of NQ'd read buffer)
	rts
;
; writeWordDMA src - puts sample in "src" to the output stream. 
; For fastest behavior, src should be A (sample in A1).
; A whole buffer is accumulated before requesting DMA. 
;
; Registers A, B and X0 are modified.
; Assumes R_DMA_OUT is initially set to point to output buffer.
;
; The macro moves src into A unless src is literally 'A' (a string
; comparison made at assembly time) and then calls putWordDMA, which blocks
; while the next output slot still belongs to a buffer awaiting DMA.
;
writeWordDMA macro source
	if "source"!='A'
	  move source,A
	endif
	asm_lgoto jsr,putWordDMA
	endm

; writeShortDMA src - same as writeWordDMA but assumes short datum.
; For fastest behavior, src should be A (sample in A1).
; A whole buffer is accumulated before requesting DMA. 
;
; Registers A, B, X0, and Y0 are modified.
; Assumes R_DMA_OUT is initially set to point to output buffer.
;
; NOTE(review): the three scaling instructions below are exactly the ones
; readShortDMA uses, which that macro describes as a LEFT shift by 8 bits.
; For data leaving the DSP in 16-bit DMA mode one might expect a shift in
; the opposite direction -- confirm against the host-side data format.
;
writeShortDMA macro source
	if "source"!='A' ; It would be no faster and less uniform
	  move source,A  ; to request "X0" as the preferred argument.
	endif
	move A,X0 #>@pow(2,-16),Y0	; X0 = datum, Y0 = scale factor
	mpy X0,Y0,A			; 48-bit product in A1:A0
	move A0,A			; keep the low product word
	asm_lgoto jsr,putWordDMA
	endm

; putWordDMA - subroutine behind writeWordDMA and writeShortDMA.
;
; Stores A at y:(R_DMA_OUT) and advances R_DMA_OUT.  If the advanced pointer
; equals y:Y_WRITE_TRIGGER, the buffer just filled is queued for DMA to the
; host.  The routine then waits while the advanced pointer equals
; y:Y_LAST_WRITE_ADDRESS, i.e. while the next slot lies in a buffer whose
; DMA has not completed (dma_write_complete moves that address on).
;
; Clobbers B and X0, plus whatever enqueue_dma_write clobbers when the
; trigger fires.
;
putWordDMA
	move A,y:(R_DMA_OUT)+		; requested write
	move R_DMA_OUT,B		; address of next word to write
	move y:Y_WRITE_TRIGGER,X0
	cmp X0,B
	jseq enqueue_dma_write
wwd_block
	move y:Y_LAST_WRITE_ADDRESS,X0	; first unavailable word
	cmp X0,B
	jne wwd_unblock
	if DEBUG_VERSION
	  set_status B_WRITE_BLOCK
	endif
	jsclr_status B_DMA_ACTIVE,next_dma ; let any Q'd dma requests out
	if BUG56_VERSION&&(!BUG56_BLOCK)
	  ; Blocking for a write... Clear it now.
	  ; Unconditional jsr, as in getWordDMA: we only get here when the
	  ; compare above matched, and next_dma may have changed the
	  ; condition codes since then, so a conditional jseq could skip the
	  ; simulated transfer.
	  jsr do_dma_write ; Simulate host getting around to a DMA read
	else
	  asm_lgoto jmp,wwd_block ; block until dma_write_complete is called
	endif
wwd_unblock
	if DEBUG_VERSION
	  clear_status B_WRITE_BLOCK
	endif
	rts

	endif		; !SYNCHED_VERSION

; **************************** DMA QUEUE CODE *********************************

; mask_host - raise the interrupt mask so that host interrupts are held off
; while the DMA queue is being modified.  Sets the I1 bit of MR and then
; executes a one-pass DO loop as a delay before the caller continues.
mask_host macro
	  ori #2,mr ; raise level to 2 (lock out host at level 1)
	  do #1,_loop
	   nop ; wait for pipeline to clear (need 8 cycles delay)
_loop
	  endm

; unmask_host - re-enable interrupts after mask_host.
; NOTE: this clears both mask bits unconditionally; it does not restore
; whatever mask was in force before mask_host was invoked.
unmask_host macro
	  andi #$FC,mr ; i1:i0 = 0
	  endm


; update_write_trigger - toggle y:Y_WRITE_TRIGGER between WRITE_BUF1 and
; WRITE_BUF2.  If the trigger currently equals WRITE_BUF1 it becomes
; WRITE_BUF2; any other value becomes WRITE_BUF1.
; Clobbers A; leaves the new trigger value in X0.
; The label inside is an ordinary (global) label, so this macro can be
; expanded only once per assembly.
update_write_trigger macro
	move y:Y_WRITE_TRIGGER,A
	move #WRITE_BUF1,X0
	cmp X0,A
	jne udwta_t		; not BUF1 now: new trigger is BUF1 (already in X0)
	move #WRITE_BUF2,X0
udwta_t move X0,y:Y_WRITE_TRIGGER
	endm

; update_read_trigger - same toggle for y:Y_READ_TRIGGER between READ_BUF1
; and READ_BUF2.  Clobbers A.  Single expansion only, as above.
update_read_trigger macro	; returns read-trigger in X0 as side effect
	move y:Y_READ_TRIGGER,A
	move #READ_BUF1,X0
	cmp X0,A
	jne udrta_t		; not BUF1 now: new trigger is BUF1 (already in X0)
	move #READ_BUF2,X0
udrta_t move X0,y:Y_READ_TRIGGER
	endm

; enqueue_dma_read - queue a host-to-DSP DMA into the read buffer that starts
; at the (newly toggled) read trigger address.
; The descriptor is the buffer address with bit 16 set to mark a read.
; CLOBBERS A,X0 (enqueue_dma saves and restores R0 and M0 itself).
enqueue_dma_read
	update_read_trigger 		; toggle trigger address; new value in X0
	move X0,y:Y_DMA_ARG 		; argument to enqueue_dma
	bset #16,y:Y_DMA_ARG		; r/w~ bit
	asm_lgoto jsr,enqueue_dma
	rts

; enqueue_dma_write - queue a DSP-to-host DMA out of the write buffer that
; starts at the (newly toggled) write trigger address.
; The descriptor is the buffer address with bit 16 clear to mark a write.
; CLOBBERS A,X0.  The trigger is taken from X0, where update_write_trigger
; leaves it, rather than being re-read through R0: putWordDMA calls this
; routine at user level, and writeWordDMA documents only A, B and X0 as
; modified, so the caller's R0 must survive.
enqueue_dma_write
	update_write_trigger 		; toggle trigger address; new value in X0
	move X0,y:Y_DMA_ARG 		; argument to enqueue_dma
	bclr #16,y:Y_DMA_ARG		; r/w~ bit
	asm_lgoto jsr,enqueue_dma
	rts

; enqueue_dma - Place DMA descriptor into next free element of DMA Q.
;		DMA descriptor (direction,,address) passed in y:Y_DMA_ARG. 
;		Since interrupts are turned off, interrupt-level regs used.
;
; Runs with host interrupts masked from start to finish so that the queue
; pointers and the p-memory register save area (shared with the interrupt
; handlers) are not touched concurrently.  A, X0, R0 and M0 are saved on
; entry and restored on exit.  If no DMA is currently active, the newly
; queued request is started immediately via next_dma.
; NOTE: unmask_host drops the interrupt mask to 0 on exit regardless of its
; value on entry.
;
enqueue_dma
	mask_host		; Act like an interrupt handler
	asm_lgoto jsr,save_regs	; We need R0.
	move #3,M0		; Q is length 4, modulo
	move y:Y_DMAQ_FREE,R0	; ptr to next free place in DMA Q
	move y:Y_DMA_ARG,X0	; direction,,address
	if DEBUG_VERSION
	  move y:(R0),A		; descriptor should be zero unless DMA Q full
	  tst A			; Blocking should prevent DMA Q from filling
	  jsne dma_error
	endif
	move X0,y:(R0)+		; enqueue dma descriptor
	move R0,y:Y_DMAQ_FREE	; advance free pointer
	jsclr_status B_DMA_ACTIVE,next_dma ; DMA restart (can't force short)
	asm_lgoto jsr,restore_regs
	unmask_host
	rts

	if DEBUG_VERSION
; Save area for B while wrap_log_ptr uses it as scratch.  These words follow
; the rts of the preceding routine, so they are never executed.
wlp_saved_b2 dc 0
wlp_saved_b1 dc 0
wlp_saved_b0 dc 0

; wrap_log_ptr - DEBUG only: advance the DMA log pointer, wrapping at the top.
; on entry, R0 points just past the log word just written
; on exit, R0 is the (possibly wrapped) log pointer, it has been stored in
;   y:Y_DMAQ_LOG_PTR, and M0 has been written at y:(R0) as a position marker
; X0 clobbered.  B is used as scratch but is saved and restored here:
; this routine runs from interrupt handlers, whose save_regs does not cover
; B, so it must not disturb the B that user-level code (for example the
; putWordDMA wait loop) is holding.
wrap_log_ptr	
	  move B2,p:wlp_saved_b2
	  move B1,p:wlp_saved_b1
	  move B0,p:wlp_saved_b0
	  move #>Y_DMA_TOP,B
	  move R0,X0
	  cmp X0,B		; B-X0 = TOP-current
	  jge dd_cont
	    move #Y_DMAQ_LOG,R0	; ran past the top: back to start of log
dd_cont	  move R0,y:Y_DMAQ_LOG_PTR ; update pointer
	  move M0,y:(R0)	; Flag where we are in log
	  move p:wlp_saved_b2,B2
	  move p:wlp_saved_b1,B1
	  move p:wlp_saved_b0,B0
 	  rts
	endif

; dequeue_dma - retire the DMA request at the head of the queue.
; Zeroes the queue slot at y:Y_DMAQ_NEXT, advances Y_DMAQ_NEXT (modulo 4),
; clears B_DMA_ACTIVE and calls next_dma to start the following request, if
; any.  Uses A, R0 and M0 (and X0 in DEBUG builds) without saving them; the
; callers in this file (dma_read_complete, dma_write_complete) have already
; called save_regs.
dequeue_dma			; CALLED AT INTERRUPT LEVEL WITH REGS SAVED
	clr A y:Y_DMAQ_NEXT,R0	; active-DMA pointer
	move #3,M0		; DMA Q is modulo
	nop
	if DEBUG_VERSION
	  ; Log two words: the final R_DMA value, then the descriptor of the
	  ; transfer just finished with bit 23 set to mark it as a dequeue.
	  move y:(R0),A		; old DMA descriptor
	  move #-1,M0		; linear addressing while writing the log
	  move y:Y_DMAQ_LOG_PTR,R0
	  nop
	  move R_DMA,y:(R0)+	; check up on final DMA transfer address
	  jsr wrap_log_ptr	; possibly flip R0 back to log start
	  move A,y:(R0)		; write dma descr for last xfer to log
	  bset #23,y:(R0)+	; flag this entry as a "dequeue"
	  jsr wrap_log_ptr	; possibly flip R0 back to log start
	  move y:Y_DMAQ_NEXT,R0	; restore R0
	  move #3,M0		; DMA Q is modulo
	  clr A
	endif	
	move A,y:(R0)+		; clear y:(y:Y_DMAQ_NEXT) to mark cell as done
	move R0,y:Y_DMAQ_NEXT	; advance "next" pointer
	clear_status B_DMA_ACTIVE ; next_dma insists on this condition
	asm_lgoto jsr,next_dma	 ; start next dma, if any
dd_nodq rts			; (label is not referenced in this file)

; next_dma - start the DMA described by the queue slot at y:Y_DMAQ_NEXT.
; If that slot is zero, the queue is empty: B_DMA_ACTIVE is cleared and the
; routine returns.  Otherwise R_DMA is loaded with the buffer address from
; the descriptor, the host is asked to perform the transfer (protocol
; depends on VIDMAIL_VERSION and BUG56_VERSION), and B_DMA_ACTIVE is set.
; Bit 16 of the descriptor selects the direction: set = host-to-DSP
; (nd_read), clear = DSP-to-host (nd_write).
; Clobbers A, R0, M0 (left at $ffff) and R_DMA.
; Besides the two callers listed below, getWordDMA and putWordDMA also
; invoke this routine from their wait loops, at user level and without
; save_regs, whenever they find B_DMA_ACTIVE clear.
next_dma			; CALLED AT INTERRUPT LEVEL WITH REGS SAVED
	; called only when DMA not active to start up a DMA
	; called by enqueue_dma, in which case Q will never be empty
	; called by dequeue_dma, in which case Q may be empty
	if DEBUG_VERSION
	  jsset_status B_DMA_ACTIVE,dma_error
	endif
	move y:Y_DMAQ_NEXT,R0	; address of next DMA descriptor
	move #$ffff,M0		; linear mode
	move y:(R0),A		; DMA descriptor
	tst A			; zero means
	jeq nd_stop		;   nothing to do
	if DEBUG_VERSION
	  move y:Y_DMAQ_LOG_PTR,R0
	  nop
	  move A,y:(R0)+	; ASSUMES M0 == -1 !
	  jsr wrap_log_ptr	; possibly flip R0 back to log start
	  move y:Y_DMAQ_NEXT,R0	; address of next DMA descriptor
	endif	
	move A1,R_DMA		; DMA start address, stripping r/w~ bit
	jset #16,y:(R0),nd_read ; test r/w~ bit (can't force short)
nd_write			; DSP-to-host transfer
	if VIDMAIL_VERSION
	  if !BUG56_VERSION
	    set_bit htie,hcr 			; Go
	  else
	    ; "jsr do_dma_write" somewhere at user level
	  endif
	else					; !VIDMAIL_VERSION
	  ; Handshake: wait for HF1 low and HTDE set, send the DM_R_REQ
	  ; message, wait for the host to raise HF1, then enable the
	  ; host-transmit interrupt so iv_hx streams the buffer out.
	  if DEBUG_VERSION
	    set_status B_AWAIT_NOT_HF1
	  endif
	  jset	#m_hf1,x:m_hsr,nd_write		; make sure HF1 is low
	  if DEBUG_VERSION
	    clear_status B_AWAIT_NOT_HF1
	  endif
	  jclr #m_htde,x:m_hsr,nd_write 	; wait until we can write host
	  movep	#DM_R_REQ,x:m_htx		; send "read request" DSP msg
	  if DEBUG_VERSION
	    set_status B_AWAIT_HF1
	  endif
	  if !BUG56_VERSION
nd_ahf1     jclr #m_hf1,x:m_hsr,nd_ahf1		; HF1 means DMA is set up to go
	  endif
	  if DEBUG_VERSION
	    clear_status B_AWAIT_HF1
	  endif
	  if !BUG56_VERSION
	    set_bit htie,hcr 			; Go
	  endif
	endif					; !VIDMAIL_VERSION
	if DEBUG_VERSION
	  set_status B_WRITE_FLOWING
	endif
	asm_lgoto jmp,nd_run

nd_read				; host-to-DSP transfer
	if VIDMAIL_VERSION
	  if !BUG56_VERSION
	    set_bit hrie,hcr 	; enable dma input flow
	    if DEBUG_VERSION
	      set_status B_READ_FLOWING
	    endif
	  else
	    ; "jsr do_dma_read" somewhere at user level
	  endif
	else
	  ; Wait for HTDE set and for the DMA and HF1 flags to be clear, then
	  ; send the DM_W_REQ message.  HRIE is not set here; sys_call sets it
	  ; when the host's sys-call host command arrives.
	  jclr #m_htde,x:m_hsr,nd_read	 	; wait until we can write host
	  jset #m_dma,x:m_hsr,nd_read 		; wait until prev dma done
	  jset #m_hf1,x:m_hsr,nd_read 		; wait until prev dma done
	  movep	#DM_W_REQ,x:m_htx		; send "write request" DSP msg
	  ;* set_bit hrie,hcr 			; done when sys_call comes in
	endif					; !VIDMAIL_VERSION

nd_run	set_status B_DMA_ACTIVE			; DMA is "active"
	rts

nd_stop	clear_status B_DMA_ACTIVE ; Empty DMA Q => DMA stops

	rts

; *************************** INTERRUPT HANDLERS ******************************

; Register save area, kept in program memory.  There is a single copy, so
; save_regs/restore_regs pairs must not nest; DEBUG builds check this with
; the B_REGS_SAVED status bit.
saved_a2 dc 0
saved_a1 dc 0
saved_a0 dc 0
saved_x0 dc 0
saved_r0 dc 0
saved_m0 dc 0

; save_regs - stash the registers used by the DMA service code:
; R0, M0, X0 and all three parts of accumulator A.
save_regs
	if DEBUG_VERSION
	  jsset_status B_REGS_SAVED,dma_error	; already saved => nesting error
	endif
	move M0,p:saved_m0		; address-generation registers first
	move R0,p:saved_r0
	move X0,p:saved_x0		; then the data register
	move A0,p:saved_a0		; then accumulator A, low to high
	move A1,p:saved_a1
	move A2,p:saved_a2
	if DEBUG_VERSION
	  set_status B_REGS_SAVED
	endif
	rts

; restore_regs - reload everything save_regs stored.
; A is rebuilt piecewise through A0/A1/A2 so that each part gets back
; exactly the stored value.
restore_regs
	if DEBUG_VERSION
	  jsclr_status B_REGS_SAVED,dma_error	; nothing saved => pairing error
	endif
	move p:saved_m0,M0
	move p:saved_r0,R0
	move p:saved_x0,X0
	move p:saved_a0,A0
	move p:saved_a1,A1
	move p:saved_a2,A2
	if DEBUG_VERSION
	  clear_status B_REGS_SAVED
	endif
	rts

; update_last_write_address - called when a DSP-to-host DMA has completed.
; Toggles y:Y_LAST_WRITE_ADDRESS, the address at which putWordDMA blocks:
; if it currently equals WRITE_END1 it becomes WRITE_END2, otherwise it
; becomes WRITE_END1.  Clobbers A and X0.  Expands to nothing in
; SYNCHED_VERSION.  Contains a global label, so it can be expanded only once.
update_last_write_address macro	; CALLED AT INTERRUPT LEVEL WITH REGS SAVED
	if SYNCHED_VERSION
	; we don't do I/O blocking behind the scenes in SYNCHED mode
	else
	  move y:Y_LAST_WRITE_ADDRESS,A
	  move #WRITE_END1,X0
	  cmp X0,A
	  jne ulwa_t		; addresses toggle
	  move #WRITE_END2,X0
ulwa_t    move X0,y:Y_LAST_WRITE_ADDRESS
	endif
	endm

; update_last_read_address - called when a host-to-DSP DMA has completed.
; Toggles y:Y_LAST_READ_ADDRESS, the address at which getWordDMA blocks,
; between READ_END1 and READ_END2 in the same way.  Clobbers A and X0.
; Expands to nothing in SYNCHED_VERSION.  Single expansion only.
update_last_read_address macro	; CALLED AT INTERRUPT LEVEL WITH REGS SAVED
	if SYNCHED_VERSION
	; we don't do I/O blocking behind the scenes in SYNCHED mode
	else
	  move y:Y_LAST_READ_ADDRESS,A
	  move #READ_END1,X0
	  cmp X0,A
	  jne ulra_t		; addresses toggle
	  move #READ_END2,X0
ulra_t    move X0,y:Y_LAST_READ_ADDRESS
	endif
	endm

; dma_write_complete - handler for the VEC_W_DONE host command, which the
; host issues when a DMA out of the DSP has finished.
; Saves registers, releases the just-sent output buffer to putWordDMA by
; toggling Y_LAST_WRITE_ADDRESS, turns the host-transmit interrupt off,
; retires the queue entry (which may start the next queued DMA), restores
; registers and returns with rti.
dma_write_complete
	asm_lgoto jsr,save_regs
	if DEBUG_VERSION
	    jsset_status B_READ_FLOWING,dma_error	; a read must not be in progress
	endif
	update_last_write_address 	; write buffer freed
	clear_bit htie,hcr   		; shouldn't matter
	if DEBUG_VERSION
	    clear_status B_WRITE_FLOWING
	endif
	asm_lgoto jsr,dequeue_dma	; remove dma ptr from Q
	asm_lgoto jsr,restore_regs
	rti

; dma_read_complete - handler for the VEC_R_DONE host command, which the
; host issues when a DMA into the DSP has finished.
; Mirror image of dma_write_complete: releases the just-filled input buffer
; to getWordDMA by toggling Y_LAST_READ_ADDRESS, turns the host-receive
; interrupt off and retires the queue entry.
dma_read_complete
	asm_lgoto jsr,save_regs
	if DEBUG_VERSION
	    jsset_status B_WRITE_FLOWING,dma_error	; a write must not be in progress
	endif
	update_last_read_address 	; read buffer filled
	clear_bit hrie,hcr   		; shouldn't matter
	if DEBUG_VERSION
	  clear_status B_READ_FLOWING
	endif
	asm_lgoto jsr,dequeue_dma	; remove dma ptr from Q
	asm_lgoto jsr,restore_regs
	rti

; dma_error - fatal error exit.  Reached by jsr from the consistency checks
; in this file and from the VEC_ABORT host-command vector; it never returns.
; Sets B_ERROR in the status word.  In the normal build it then masks host
; interrupts, raises HF2 and HF3 as an abort signal to the host, and spins
; forever.
; NOTE(review): in the Bug56 build only SWI is assembled here, with nothing
; after it inside this block -- presumably the Bug56 monitor takes over at
; the SWI; confirm that execution cannot continue into whatever is
; assembled next.
dma_error
	set_status B_ERROR
	if BUG56_VERSION
		SWI
	else
		ori #2,mr ; raise level to 2 (lock out host interrupts)
		set_bit	hf3,hcr	; abort code = HF2 and HF3
		set_bit hf2,hcr
dma_abort	asm_lgoto jmp,dma_abort	; hang here until the DSP is reset
	endif

	if BUG56_VERSION

; do_dma_write - Bug56 only: carry out one DSP-to-host buffer transfer by
; programmed I/O in place of the interrupt-driven DMA.
; With BUG56_BLOCK clear, sends DMA_WRITE_SIZE words from y:(R_DMA)+ to the
; host, waiting for HTDE before each, and then calls dma_write_complete
; directly, standing in for the host command that normally ends the DMA.
; With BUG56_BLOCK set, the routine is just an rts.
do_dma_write ; CALL AT USER LEVEL
	if !BUG56_BLOCK
 	  do #DMA_WRITE_SIZE,ddw_wloop
ddw_wblock  jclr #m_htde,x:m_hsr,ddw_wblock	; Manually read words in Bug56
	     movep 	y:(R_DMA)+,x:m_htx	; send next word of y-memory buffer to host
ddw_wloop
	  jsr dma_write_complete		; Don't issue host command
	endif
	rts

; do_dma_read - Bug56 only: carry out one host-to-DSP buffer transfer by
; programmed I/O.
; With BUG56_BLOCK clear, receives DMA_READ_SIZE words from the host into
; y:(R_DMA)+, waiting for HRDF before each, and then calls
; dma_read_complete directly.  With BUG56_BLOCK set, it is just an rts.
do_dma_read ; CALL AT USER LEVEL
	if !BUG56_BLOCK
	  do #DMA_READ_SIZE,ddr_rloop
ddr_rblock  jclr #m_hrdf,x:m_hsr,ddr_rblock	; Manually feed words in Bug56
	    movep x:m_hrx,y:(R_DMA)+		; store next host word in y-memory buffer
ddr_rloop
	  jsr dma_read_complete			; Don't issue host command
	endif
	  rts

	endif					; BUG56_VERSION
