
                ;   BMOV(src, dst, len)
                ;
                ;   The memory move algorithm is somewhat more of a mess
                ;   since we must do it either ascending or decending.

                public  _bmov
_bmov:          move.l  4(sp),A0
                move.l  8(sp),A1
                move.l  12(sp),D0
                cmp.l   A0,A1           ;move to self
                beq     .bmend
                bls     .bmup
.bmdown         adda.l  D0,A0           ;descending copy
                adda.l  D0,A1
                move.w  A0,D1           ;CHECK WORD ALIGNED
                btst.l  #0,D1
                bne     .bmdown1
                move.w  A1,D1
                btst.l  #0,D1
                bne     .bmdown1
                cmp.l   #259,D0             ;chosen by calculation.
                blo     .bmdown8

                move.l  D0,D1               ;overhead for bmd44: ~360
                divu    #44,D1
                bvs     .bmdown8            ;too big (> 2,883,540)
                movem.l D2-D7/A2-A6,-(sp)   ;use D2-D7/A2-A6 (11 regs)
                move.l  #11*4,D0
                bra     .bmd44b
.bmd44a         sub.l   D0,A0               ;8          total 214/44bytes
                movem.l (A0),D2-D7/A2-A6    ;12 + 8*11  4.86 cycles/byte
                movem.l D2-D7/A2-A6,-(A1)   ; 8 + 8*11
.bmd44b         dbf     D1,.bmd44a          ;10
                swap    D1                  ;D0<15:7> already contain 0
                move.w  D1,D0               ;D0 = remainder
                movem.l (sp)+,D2-D7/A2-A6

.bmdown8        move.w  D0,D1               ;D1<2:0> = #bytes left later
                lsr.l   #3,D0               ;divide by 8
                bra     .bmd8b
.bmd8a          move.l  -(A0),-(A1)         ;20         total 50/8bytes
                move.l  -(A0),-(A1)         ;20         = 6.25 cycles/byte
.bmd8b          dbf     D0,.bmd8a           ;10
                sub.l   #$10000,D0
                bcc     .bmd8a
                move.w  D1,D0               ;D0 = 0 to 7 bytes
                and.l   #7,D0
                bne     .bmdown1
                rts

.bmd1a          move.b  -(A0),-(A1)         ;12         total 22/byte
.bmdown1                                    ;           = 22 cycles/byte
.bmd1b          dbf     D0,.bmd1a           ;10
                sub.l   #$10000,D0
                bcc     .bmd1a
                rts

.bmup           move.w  A0,D1               ;CHECK WORD ALIGNED
                btst.l  #0,D1
                bne     .bmup1
                move.w  A1,D1
                btst.l  #0,D1
                bne     .bmup1
                cmp.l   #259,D0             ;chosen by calculation
                blo     .bmup8

                move.l  D0,D1               ;overhead for bmu44: ~360
                divu    #44,D1
                bvs     .bmup8              ;too big (> 2,883,540)
                movem.l D2-D7/A2-A6,-(sp)   ;use D2-D7/A2-A6 (11 regs)
                move.l  #11*4,D0
                bra     .bmu44b
.bmu44a         movem.l (A0)+,D2-D7/A2-A6   ;12 + 8*11  ttl 214/44bytes
                movem.l D2-D7/A2-A6,(A1)    ;8  + 8*11  4.86 cycles/byte
                add.l   D0,A1               ;8
.bmu44b         dbf     D1,.bmu44a          ;10
                swap    D1                  ;D0<15:7> already contain 0
                move.w  D1,D0               ;D0 = remainder
                movem.l (sp)+,D2-D7/A2-A6

.bmup8          move.w  D0,D1               ;D1<2:0> = #bytes left later
                lsr.l   #3,D0               ;divide by 8
                bra     .bmu8b
.bmu8a          move.l  (A0)+,(A1)+         ;20         total 50/8bytes
                move.l  (A0)+,(A1)+         ;20         = 6.25 cycles/byte
.bmu8b          dbf     D0,.bmu8a           ;10
                sub.l   #$10000,D0
                bcc     .bmu8a
                move.w  D1,D0               ;D0 = 0 to 7 bytes
                and.l   #7,D0
                bne     .bmup1
                rts

.bmu1a          move.b  (A0)+,(A1)+
.bmup1
.bmu1b          dbf     D0,.bmu1a
                sub.l   #$10000,D0
                bcc     .bmu1a
.bmend          rts

