     page 132,60,1,1
;*******************************************
;Motorola Austin DSP Operation  June 30,1988
;*******************************************
;DSP96002
;Peripheral to Memory FFT - 256 point
;File name: E2-96.asm
;**************************************************************************
;    Maximum sample rate:  321.5 us at 27.0 MHz
;    Memory Size: Prog:  165 words ; Data:  1794 words
;    Number of clock cycles:   8680 (4340 instruction cycles)
;    Clock Frequency:    27.0MHz
;    Instruction cycle time:  74.1ns
;**************************************************************************
;
; Complex, Radix 2 Cooley-Tukey Decimation in Time FFT
; Untested
;
; Faster FFT using Programming Tricks found in Typical FORTRAN Libraries
;
;      First two passes combined as a four butterfly loop since
;            multiplies are trivial.
;            2.25 cycles internal (4 cycles external) per Radix 2 
;            butterfly.
;      Middle passes performed with traditional, triple-nested DO loop.
;            4 cycles internal (8 cycles external) per Radix 2 butterfly
;            plus overhead.  Note that a new pipelining technique is 
;            being used to minimize overhead.
;      Next to last pass performed with double butterfly loop.
;            4.5 cycles internal (8.5 cycles external) per Radix 2
;            butterfly.
;      Last pass has separate single butterfly loop.
;            5 cycles internal (9 cycles external) per Radix 2 
;            butterfly.
;
;      For 256 complex points, average Radix 2 butterfly = 3.8 cycles
;      internal and 7.35 cycles external, assuming a single external
;      data bus.
;
;      Because of separate passes, minimum of 32 points using these
;      optimizations.  Approximately 150 program words required. 
;      Uses internal X and Y Data ROMs for twiddle factor coefficients
;      for any size FFT up to 1024 complex points.
;                           
; First two passes
;
;      9 cycles internal, 1.77X faster than 4 cycle Radix 2 bfy
;      16 cycles external, 2.0X faster than 4 cycle Radix 2 bfy
;
;      r0 = a pointer in & out
;      r6 = a pointer in
;      r4 = b pointer in & out
;      r1 = c pointer in & out
;      r5 = d pointer in & out
;      n5 = 2
;
;      normally ordered input data
;      normally ordered output data
;
;    dma set-up
;
     ; One-time start-up: initialise the two buffer pointers kept in X memory,
     ; then program the DMA register group at x:$ffffffd8..x:$ffffffde and
     ; start the first transfer.  The role given to each register address is
     ; taken from the original per-line comments in this file.
     ; The control word at x:$ffffffd8 is written last, after every other
     ; register of the group has been loaded -- keep that order when editing.
     move #data,x:ptr1             ;ptr1 = data (first half of the input buffer)
     move #data+points,x:ptr2      ;ptr2 = data+points (second half)
     move #ad,x:$ffffffde          ;a/d memory location into dma source reg.
     move #0,x:$ffffffdd           ;no offset for source reg.
     move #2*points-1,x:$ffffffdb  ;modulo 2*points for input data collection
     move #1,x:$ffffffd9           ;offset = 1 for data storage
     move #data,x:$ffffffda        ;input data base address
     move #points,x:$ffffffdc      ;dma counter = points (header describes a 256-point FFT, not 1024)
     move #$80008035,x:$ffffffd8   ;start dma in block mode
;
;    test if dma ready, if so switch buffer pointers
;
; Main-loop entry (target of the final "jmp strt").
; Spins until bit 28 of the word at x:$ffffffd8 (the same location that is
; written to start the dma) is set, then reloads the dma counter, restarts
; the dma with the same control word used at start-up, and exchanges the
; contents of ptr1 and ptr2 using d1.l and d1.m as temporaries.
; ptr1/ptr2 are accessed with an explicit x: qualifier so that the swap
; reads and writes the same X-memory words that the start-up code
; initialised with "move #...,x:ptr1" / "move #...,x:ptr2".
; Clobbers: d1.l, d1.m.
strt jclr #28,x:$ffffffd8,strt     ;wait until dma is done
     move #points,x:$ffffffdc      ;if so, reinitialize dma counter
     move #$80008035,x:$ffffffd8   ;restart dma,
     move x:ptr1,d1.l              ;and swap buffer pointers: d1.l = old ptr1
     move x:ptr2,d1.m              ;d1.m = old ptr2
     move d1.l,x:ptr2              ;ptr2 <- old ptr1
     move d1.m,x:ptr1              ;ptr1 <- old ptr2
;
;    FFT routine:
      ; FFT set-up: load the size constants, derive the four quarter-buffer
      ; pointers used by the combined first/second pass, set the address
      ; modifier registers, and pre-load the first operands for the loop
      ; that follows.
      ; ptr2 is read with an explicit x: qualifier, matching the
      ; "move #...,x:ptr2" that initialises it at start-up.
      move      #points,d1.l
      move      #passes,d9.l
      move      x:ptr2,d0.l
      move      #coef,m2
      move      #coefsize,d2.l

      ; Pointer derivation.  The a/b/c/d naming follows the register table in
      ; the comment block at the top of the file.
      ; NOTE(review): the values below assume that a parallel move reads its
      ; source register before the ALU operation on the same line writes its
      ; result -- confirm against the DSP96002 manual.
      ;   lsr d1 (twice)  -> d1 = points/4 after the second shift
      ;   r0 = r2 = r6    = buffer base taken from ptr2   (a pointer)
      ;   d8.l            = points/4 (loop count of the first-two-pass loop)
      ;   r4              = base + points/4               (b pointer)
      ;   r1              = base + points/2               (c pointer)
      ;   r5              = base + 3*points/4             (d pointer)
      ;   lsr d2 (twice)  -> d2 = coefsize/4, copied to n6 (twiddle step)
      lsr      d1         d0.l,r0
      lsr      d1         r0,r2
      add      d1,d0      d1.l,d8.l
      add      d1,d0      d0.l,r4
      add      d1,d0      d0.l,r1
      lsr      d2         d0.l,r5
      lsr      d2         r0,r6
      move      #2,n5
      move      d2.l,n6
      ; m0 = -1, copied to m1, m4, m5 and m6.
      ; NOTE(review): -1 presumably selects linear (non-modulo) addressing;
      ; the last-pass code later switches m4/m5 to 0 and labels that value
      ; "bit reversed addressing".  These moves reset them on every run.
      move      #-1,m0
      move      m0,m1
      move      m0,m4
      move      m0,m5
      move      m0,m6

      ; Loop prologue: fetch the first operands and issue the first two
      ; faddsub operations.  The same load/faddsub sequence appears at the
      ; bottom of the loop body that follows, where it prepares the next
      ; iteration.
      fmove                             x:(r0),d1.s
      fmove                             x:(r1),d0.s             
      fmove                             x:(r5)-,d2.s            
      fmove                                          y:(r5)+,d4.s
      faddsub  d1,d0                    x:(r4),d5.s               
      faddsub  d5,d2                                 y:(r4),d7.s  
;
;      Combine first two passes with trivial multiplies.
;
      ; Combined first and second pass.  The body runs d8.l times; d8.l was
      ; loaded with points/4 during set-up.  Only faddsub / faddsubr / ftfr
      ; are used -- there are no multiplies in this loop (the header calls
      ; the twiddle multiplies of these passes "trivial").
      ; Data is read and written in place through r0, r4, r1 and r5 (the
      ; a, b, c and d pointers of the header table); r6 is read-only here.
      ; The loop is software pipelined: the loads and the two faddsub
      ; operations in the last five lines of the body prepare the operands
      ; of the NEXT iteration, mirroring the prologue placed before the loop.
      ; Do not reorder lines: each parallel move depends on the register
      ; contents produced by the lines immediately before it.
      do      d8.l,_twopass

      faddsubr d0,d2                                   y:(r5),d6.s  
      faddsub  d7,d6                     d2.s,x:(r0)+  y:(r6)+,d3.s 
      faddsubr d1,d7                     d0.s,x:(r4)   y:(r1)+,d2.s 
      faddsub  d3,d2                     d1.s,x:(r5)-             
      faddsubr d2,d6                     x:(r0)-,d1.s  d4.s,y:(r5)+n5
      faddsubr d3,d5                     x:(r1)-,d0.s  d2.s,y:(r4)+  
      faddsub  d1,d0                     x:(r5),d2.s   d6.s,y:(r0)+  
      ftfr     d5,d4                     x:(r4),d5.s   d3.s,y:(r1)   
      faddsub  d5,d2                     d7.s,x:(r1)+  y:(r4),d7.s    
_twopass
      ; Pipeline drain: inside the loop d4 is stored by the
      ; "d4.s,y:(r5)+n5" move of the following iteration, so the value left
      ; in d4 by the final iteration is written out here.
      fmove                                            d4.s,y:(r5)+   
;
; Middle passes
;
      ; Middle passes: triple-nested loop (pass / group / butterfly).
      ;
      ; Set-up before the pass loop:
      ;   d3   = passes (copy of d9), then d3 = passes - 4 after the sub
      ;   d0.l = 4          group count of the first middle pass
      ;   d1.l = d8.l       (points/4 from the FFT set-up)
      ;   m6   = d2.l, with d2 cleared on the previous line, i.e. 0.
      ; NOTE(review): the last-pass code labels an m register value of 0 as
      ; "bit reversed addressing", so the x:(r6)+n6 twiddle fetches below
      ; presumably step through the table in bit-reversed order -- confirm.
      ; NOTE(review): the pass loop count is passes-4; check how "do" treats
      ; a count of zero on this part.  The header states a 32-point minimum.
      ; NOTE(review): comments of the form "reg = value" on lines that carry
      ; both an ALU operation and a move assume the move reads its source
      ; before the ALU result is written -- confirm in the DSP96002 manual.
      tfr      d9,d3      #4,d0.l
      clr      d2         d8.l,d1.l
      sub      d0,d3      d2.l,m6
      do      d3.l,_end_pass
      ; Per-pass set-up:
      ;   n2 = d0.l            number of groups in this pass
      ;   r0 = r2              back to the data base address
      ;   d1 halved by lsr;    r6 = m2 (twiddle table base, set from #coef)
      ;   n0 = n4 = n5 = d1    (value before the two decrements)
      ;   n1 = d1 - 1
      ;   n3 = d1 - 2          inner-loop count; the code between _end_bfy and
      ;                        _end_grp repeats the four-line butterfly twice more
      ;   r1 = r0 + n0         second input pointer
      ;   r4 = r0, r5 = r1     store pointers start at the load pointers (in place)
      move    d0.l,n2
      move    r2,r0
      lsr     d1      m2,r6
      dec     d1      d1.l,n0
      dec     d1      d1.l,n1
      move    d1.l,n3
      move    n0,n4
      move    n0,n5
      lea     (r0)+n0,r1
      move    r0,r4
      move    r1,r5
      ; Pipeline prologue: load the first twiddle pair into d9/d8 and start
      ; the first butterfly so that the loop body can overlap the end of one
      ; butterfly with the start of the next.
      ; NOTE(review): "y:()" names no address register; presumably it reuses
      ; the r6 address of the x: move on the same line -- confirm against the
      ; assembler's syntax.
      fmove                             x:(r6)+n6,d9.s y:(),d8.s
      fmove                                            y:(r1),d7.s
      fmpy    d8,d7,d3                  x:(r1)+,d6.s
      fmpy    d9,d6,d0
      fmpy    d9,d7,d1                                 y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+,d6.s

      ; Group loop: n2 iterations.
      do      n2,_end_grp

      ; Butterfly loop: n3 iterations of the four-instruction butterfly
      ; (header: "4 cycles internal per Radix 2 butterfly").  d9/d8 stay
      ; fixed for the whole inner loop.
      do      n3,_end_bfy
      fmpy    d9,d6,d0  fsub     d1,d2  d0.s,x:(r4)    y:(r0)+,d5.s
      fmpy    d9,d7,d1  faddsubr d5,d2  d4.s,x:(r5)    y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s    d2.s,y:(r5)+
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+,d6.s   d5.s,y:(r4)+
_end_bfy
      ; Group tail: r1 is advanced by n1, the butterfly sequence is issued
      ; twice more, and the next twiddle pair is loaded into d9/d8 part-way
      ; through.  The second copy uses +n0 / +n5 / +n4 updates on r0, r5 and
      ; r4 instead of the +1 updates used inside the inner loop.
      move      (r1)+n1
      fmpy    d9,d6,d0  fsub     d1,d2  d0.s,x:(r4)    y:(r0)+,d5.s
      fmpy    d9,d7,d1  faddsubr d5,d2  d4.s,x:(r5)    y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s    d2.s,y:(r5)+
      fmove                             x:(r6)+n6,d9.s y:(),d8.s
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+,d6.s   d5.s,y:(r4)+
      fmpy    d9,d6,d0  fsub     d1,d2  d0.s,x:(r4)    y:(r0)+n0,d5.s
      fmpy    d9,d7,d1  faddsubr d5,d2  d4.s,x:(r5)    y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s    d2.s,y:(r5)+n5
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+,d6.s   d5.s,y:(r4)+n4
_end_grp
      ; End of pass: d0 = 2 * n2 (group count for the next pass) and
      ; d1 = n0, which the next pass's "lsr d1" halves again.
      move     n2,d0.l
      lsl      d0      n0,d1.l
_end_pass
;
; next to last pass
;
      ; Next-to-last pass ("double butterfly loop" in the header).
      ; Set-up:
      ;   n2 = d0.l        loop count; d0 holds the group count left by the
      ;                    preceding middle-pass code
      ;   r0 = r4 = r2     first load/store pointer = data base address
      ;   r1 = r5 = r0+2   second load/store pointer, two locations further on
      ;   r6 = m2          twiddle table base
      ;   n0 = n1 = n4 = n5 = 3   step applied at the end of each iteration
      move      d0.l,n2
      move      r2,r0
      move      r0,r4
      lea      (r0)+2,r1
      move      r1,r5
      move      m2,r6
      move      #3,n0
      move      n0,n1
      move      n0,n4
      move      n0,n5
      ; Pipeline prologue: first twiddle pair into d9/d8 and the first
      ; butterfly started ahead of the loop.
      ; NOTE(review): "y:()" names no address register; presumably it reuses
      ; the r6 address of the x: move on the same line -- confirm.
      fmove                             x:(r6)+n6,d9.s  y:(),d8.s
      fmove                                             y:(r1),d7.s
      fmpy    d8,d7,d3                  x:(r1)+,d6.s
      fmpy    d9,d6,d0
      fmpy    d9,d7,d1                                  y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+n1,d6.s

      ; Each iteration issues the four-line butterfly sequence twice and
      ; loads one new twiddle pair between them.  The first copy steps the
      ; pointers by 1, the second by n0/n1/n4/n5 (= 3).
      do      n2,_end_next
      fmpy    d9,d6,d0  fsub     d1,d2  d0.s,x:(r4)     y:(r0)+,d5.s
      fmpy    d9,d7,d1  faddsubr d5,d2  d4.s,x:(r5)     y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s     d2.s,y:(r5)+
      fmove                             x:(r6)+n6,d9.s  y:(),d8.s
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+,d6.s    d5.s,y:(r4)+
      fmpy    d9,d6,d0  fsub     d1,d2  d0.s,x:(r4)     y:(r0)+n0,d5.s
      fmpy    d9,d7,d1  faddsubr d5,d2  d4.s,x:(r5)     y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s     d2.s,y:(r5)+n5
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+n1,d6.s  d5.s,y:(r4)+n4
_end_next
;
; last pass
;
      ; Last pass ("separate single butterfly loop" in the header).
      ; Unlike the earlier passes this one does not work in place: inputs are
      ; read through r0/r1 from the data buffer and results are written
      ; through r4/r5 into the buffer at #odata, with m4 = m5 = 0.
      ; r2 is overwritten with the output address here; the FFT set-up
      ; reloads r2 from the buffer pointer on the next run.
      move      n2,d0.l       ;# previous groups ->d0
      lsl       d0      r2,r0 ;update # groups, r0 points to input data A
      move      d0.l,n2       ;n2 = doubled count = loop count of the last-pass loop
      move      #odata,r4     ;r4 points to A output
      lea      (r0)+,r1         ;r1 points to B input
      move      r4,r2         ;r2 points to A output                 
      move      m2,r6           ;r6 points to twiddle factors  
      move      #2,n0         ;offset is 2 for A input pointer
      move      n0,n1         ;offset is 2 for B input pointer
      lea (r2)+n2,r5     ;r5 points to B output
      move     #points/4,n4   ;offset is #points/4 for A output pointer
      move      n4,n5         ;offset is #points/4 for B output pointer
      move     #0,m4          ;bit reversed addressing for A output pointer
      move     m4,m5          ;bit reversed addressing for B output pointer

      ; Pipeline prologue.  Two twiddle pairs are fetched here because the
      ; loop body loads a new pair into d9/d8 on every iteration.
      ; NOTE(review): "y:()" names no address register; presumably it reuses
      ; the r6 address of the x: move on the same line -- confirm.
      fmove                             x:(r6)+n6,d9.s  y:(),d8.s
      fmove                                             y:(r1),d7.s
      fmpy    d8,d7,d3                  x:(r1)+n1,d6.s
      fmpy    d9,d6,d0
      fmpy    d9,d7,d1                                  y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s
      fmove                             x:(r6)+n6,d9.s  y:(),d8.s
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+n1,d6.s

      ; One butterfly per iteration: four fmpy lines plus the twiddle fetch
      ; (header: "5 cycles internal per Radix 2 butterfly").
      do      n2,_end_last
      fmpy    d9,d6,d0  fsub     d1,d2  d0.s,x:(r4)     y:(r0)+n0,d5.s
      fmpy    d9,d7,d1  faddsubr d5,d2  d4.s,x:(r5)     y:(r1),d7.s
      fmpy    d8,d6,d2  fadd     d3,d0  x:(r0),d4.s     d2.s,y:(r5)+n5
      fmove                             x:(r6)+n6,d9.s  y:(),d8.s
      fmpy    d8,d7,d3  faddsubr d4,d0  x:(r1)+n1,d6.s  d5.s,y:(r4)+n4
_end_last

     jmp  strt           ;go back and start new fft
