;
;  S I M I L F . A S M
;  =================
;  Entered & adapted by J Meeks (& Herb Martin a little); latest
;  version 05-09-89, 00:13am.
;
;  Adapted from Dr. Dobb's Journal, 07-88 issue, pp. 46-51 (text), pp.
;  68-72 (program listing).
;
;  Written by John W. Ratcliff & David E. Metzener, 11-10-87.
;
;  Uses the Ratcliff/Obershelp pattern recognition algorithm.
;
;  This program provides a new function for Turbo Pascal 5.0 on an
;  8086-based machine.  The function similf returns a percentage value
;  corresponding to the degree to which two strings are similar to one
;  another.  Be certain to convert both strings to uppercase if you
;  want to have a comparison that is case-insensitive.
;
;  NOTE: This routine is for use with Turbo Pascal 5.0 programs.  The
;  original simil is for use with C programs, and is almost identical
;  to this one.
;
         TITLE   SIMILF.ASM
;
         ASSUME  CS: Code, DS: Data, ES: Data
;
DATA     SEGMENT
;
ststr1l  dw      25 dup (?)      ; Lefts for string 1.
ststr1r  dw      25 dup (?)      ; Rights for string 1.
ststr2l  dw      25 dup (?)      ; Lefts for string 2.
ststr2r  dw      25 dup (?)      ; Rights for string 2.
stcknum  dw      ?               ; # elements on stack.
score    dw      ?               ; # chars in common * 2.
total    dw      ?               ; Total # chars in strings 1 & 2.
cl1      dw      ?               ; Left of string 1 found in common.
cr1      dw      ?               ; Right of string 1 found in common.
cl2      dw      ?               ; Left of string 2 found in common.
cr2      dw      ?               ; Right of string 2 found in common.
s2ed     dw      ?               ; End of string 2 used in compare.
;
DATA     ENDS
         PUBLIC similf
;
Code    SEGMENT
;
similf  PROC    NEAR
;
;  This routine expects pointers passed to two character strings, null
;  terminated, that you wish compared.  It returns a percentage value
;  from 0% to 100% corresponding to how alike the two strings are.
;
;  Usage:       Simil (Str1, Str2: string);
;
;  The similarity routine is composed of three major components:
;    compare   Finds the largest group of characters in common between
;                any two string sections.
;    pushst    Pushes a string section to be compared onto the stack.
;    popst     Pops a string section to be examined off the stack.
;
;  The similarity routine begins by computing the total length of both
;  strings passed and placing that value in total.  It then takes the
;  beginnings and endings of both strings passed and pushes them onto
;  the stack.  It then falls through into the main line code.

;  The original two strings are immediately popped off the stack and
;  are passed to the compare routine.  The compare routine finds the
;  largest group of characters in common between the two strings.

;  The number of characters in common is multiplied by two and added
;  to the total score.  If there were no characters in common, then
;  there is nothing to push onto the stack.  If there is exactly one
;  character to the left in both strings then we needn't push them
;  onto the stack (we already know they aren't equal from the previous
;  call to compare). Otherwise the characters on the left are pushed
;  onto the stack.  These same rules apply to characters to the right
;  of the substring found in in common.  This process of popping
;  substrings off the stack, comparing them, and pushing remaining
;  sections onto the stack is continued until the stack is empty.

;  On return, the total score is divided by the number of characters
;  in both strings.  This is multiplied by 100 to yield a percentage.
;  This percentage similarity is returned to the calling procedure.

         push    bp                ; Save BP.
         mov     bp, sp            ; Save SP in BP for use in program.
         push    es                ; Save ES.
         mov     ax, ds            ; Copy DS into ES.
         mov     es, ax
         xor     ax, ax            ; Clear AX.
         mov     score, ax         ; Clear score variable.
         mov     stcknum, ax       ; Clear number of stack entries.
         mov     si, [bp+4]        ; Move string 1 begin ptr to SI.
         mov     di, [bp+8]        ; Move string 2 begin ptr to DI.
         cmp     [si], al          ; Is string 1 a null string?
         je      strerr            ; Can't process null strings!
         cmp     [di], al          ; Is string 2 a null string?
         jne     docmp             ; Neither string null; process 'em.
strerr:  jmp     donit             ; Jump to exit routine.
docmp:   push    di                ; Save DI (because of SCAS).
         push    si                ; Save SI (because of SCAS).
         xor     al, al            ; Clr AL to search for string end.
         cld                       ; Clr direction flag to forward.
         mov     cx, -1            ; Repeat correct # times.
         repnz   scasb             ; Scan for delimiter in string 2.
         dec     di
         dec     di                ; Point DI to last char of str 2.
         mov     bp, di            ; Move DI to BP, where should be.
         pop     di                ; Original SI into DI for scasb.
         repnz   scasb             ; Scan for delimiter in string 1.
         not     cx                ; 1's comp of string 1's lentgh.
         sub     cx, 2             ; - 2 zero bytes at end string 1.
         mov     total, cx         ; Sum of strings' lengths in total.
         dec     di
         dec     di                ; Point DI to last char of str 1.
         mov     bx, di            ; Move DI to BX, where should be.
         pop     di                ; Restore original DI.
         call    pushst            ; Push for 1st call to compare.
main:    cmp     stcknum, 0        ; Is there anything on the stack?
         je      done              ; No.  Nothing to compare!
         call    popst             ; Set up regs for a compare call.
         call    compare           ; Do compare for substring set.
         cmp     dx, 0             ; Nothing in common to push.
         je      main              ; Try another set substrings.
         shl     dx, 1             ; Multiply # common chars by 2.
         add     score, dx         ; Add 2 * common chars to total.
         mov     bp, stcknum       ; Get # of entry to be looked at.
         shl     bp, 1             ; Ready AX to access string stk.
         mov     si, [ststr1l+bp]  ; Move l1 into SI (or l1).
         mov     bx, cl1           ; Move cl1 into BX (or r1).
         mov     di, [ststr2l+bp]  ; Move l2 into DI (or l2).
         mov     cx, cl2           ; Move cl2 into CX temporarily.
         mov     ax, [ststr1r+bp]  ; Get old r1 off stack.
         mov     cl1, ax           ; Place in cl1 temporarily.
         mov     ax, [ststr2r+bp]  ; Get old r2 off stack.
         mov     cl2, ax           ; Place in cl2 temporarily.
         mov     bp, cx            ; Move cl2 into BP.
         cmp     bx, si            ; Compare cl1 to l1.
         je      chrght            ; Nothing on left side of string 1.
         cmp     bp, di            ; Compare cl2 to l2.
         je      chrght            ; Nothing on left side of string 2.
         dec     bx                ; Point last part left side str 1.
         dec     bp                ; Point last part left side str 2.
         cmp     bx, si            ; Only one char to compare?
         jne     pushit            ; No.  We need to examine this.
         cmp     bp, di            ; Only one character in both?
         je      chrght            ; Only 1 char; nothing to look at.
pushit:  call    pushst            ; Push left side onto stack.
chrght:  mov     si, cr1           ; Move cr1 into SI (or l1).
         mov     bx, cl1           ; Move r1 into BX (or r1).
         mov     di, cr2           ; Move cr2 into DI (or l2).
         mov     bp, cl2           ; Move r2 into BP (or r2).
         cmp     si, bx            ; Compare cr1 to r1.
         je      main              ; Nothing on right side of str 1.
         cmp     di, bp            ; Compare cr2 to r2.
         je      main              ; Nothing on right side of str 2.
         inc     si                ; Point last part right side str 1.
         inc     di                ; Point last part right side str 2.
         cmp     bx, si            ; Only one character to examine?
         jne     push2             ; No.  Examine it.
         cmp     bp, di            ; Only one character in both?
         je      main              ; Yes.  Get next strs off stack.
push2:   call    pushst            ; Push right side onto stack.
         jmp short  main           ; Do next level of compares.
done:    mov     ax, score         ; Get score into AX for multiply.
         mov     cx, 100           ; Put 100 into CX for multiply.
         mul     cx                ; Multiply score by 100.
         mov     cx, total         ; Get total # chars for divide.
         div     cx                ; Div (score * 100) by # chars.
donit:   pop     es                ; Restore ES to entry value.
         POP     BP                ; Restore BP to entry value.
         ret     8                 ; Return; AX = % similarity.
similf  ENDP
;
;
compare  PROC    NEAR
;
;  The compare procedure locates the largest matching group of
;  characters between string 1 and string 2.  The routine assumes
;  the CPU's direction flag is cleared.
;
;  Pass to this routine:
;    DS:SI   l1 (left side of string 1),
;    BX      r1 (right side of string 1),
;    ES:DI   l2 (left side of string 2),
;    BP      r2 (right side of string 2).
;  The routine returns:
;    DX      # characters matching,
;    cl1     Left side of first matching substring,
;    cl2     Left side of second matching substring,
;    cr1     Right side of first matching substring,
;    cr2     Right side of second matching substring.
;
;  The compare routine is composed of two loops, an inner and an
;  outer. The worst case scenario is that there are absolutely no
;  characters in common between string 1 and string 2.  In this case
;  N * M compares are performed.  However, when a substring match
;  occurs in the inner loop, then the next character to be examined in
;  string 2 (for this loop) is advanced by the number of characters
;  found equal. Whenever a new maximum number of characters in common
;  is found then the ending locations of both the inner and outer
;  loops are backed off by the difference between the new max chars
;  value and the old max chars value; i.e., if 5 characters have been
;  found in common part of the way through the search then we can cut
;  our search short by 5 characters since there is no chance of
;  finding better than a 5-character match at that point.  This
;  technique means that an exact match will require only a single
;  compare, and combinations of other matches will proceed as
;  efficiently as possible.
;
;  Note that we branch downward for both newmax and newmax2, because
;  on the 8086 line of processors a branch not taken is much faster
;  than one that is taken.  Since the not-equal condition is to be
;  found most often, and we would like the inner loop to execute as
;  quickly as possible, we branch outside the loop's body on the less
;  frequent occurence.  When a match and/or a new maxchars is found
;  then we branch down to these two routines, process the new
;  conditions, then jump back up into the main line loop code.
         mov     s2ed, bp          ; Store end of string 2.
         xor     dx, dx            ; Initialize maxchars.
forl3:   push    di                ; Save start of string 2.
forl4:   push    di                ; Save start of string 2.
         push    si                ; Save start of string 1.
         mov     cx, s2ed          ; Set up to calc length of str 1.
         sub     cx, di            ; Get (length of string 1) - 1.
         inc     cx                ; Get length of string 1.
         push    cx                ; Save starting length of str 1.
         repz    cmpsb             ; Compare strings.
         jz      equal             ; If equal, skip fixes.
         inc     cx                ; Back up (CMPSB decs even if <>).
equal:   pop     ax                ; Get starting length of string 1.
         sub     ax, cx            ; Get length of common substring.
         jnz     newmax            ; More than zero chars matched.
         pop     si                ; Get back start of string 1.
         pop     di                ; Get back start of string 2.
reent:   inc     di                ; Do next char no matter what.
reent2:  cmp     di, bp            ; Are we done with string 2?
         jle     forl4             ; No.  Do next string compare.
         pop     di                ; Get back start of string 2.
         inc     si                ; Next char in string 1 to scan.
         cmp     si, bx            ; Are we done with string 1?
         jle     forl3             ; No.  Do next string compare.
         ret                       ; Return with maxchars in DX.
;
newmax:  cmp     ax, dx            ; Greater than maxchars?
         jg      newmx2            ; Yes.  Update new maxchars & ptrs.
         pop     si                ; Get back start of string 1.
         pop     di                ; Get back start of string 2.
         add     di, ax            ; Skip past matching chars.
         jmp short  reent2         ; Re-enter inner loop.
newmx2:  pop     si                ; Get back start of string 1.
         pop     di                ; Get back start of string 2.
         mov     cl1, si           ; Put beginning match of str 1.
         mov     cl2, di           ; Put beginning match of str 2.
         mov     cx, ax            ; Save new maxchars.
         sub     ax, dx            ; Get delta for str end adjustment.
         sub     bx, ax            ; Adjust end of string 1.
         sub     bp, ax            ; Adjust end of string 2.
         mov     dx, cx            ; New maxchars.
         dec     cx                ; Set up advance last match char.
         add     di, cx            ; Advance to last match char str 2.
         mov     cr2, di           ; Put ending of match of string 2.
         add     cx, si            ; Advance to last match char str 1.
         mov     cr1, cx           ; Put ending of match of string 1.
         jmp short  reent          ; Re-enter inner loop.
;
compare  ENDP
pushst   PROC    NEAR
;
;  Pass to this routine:
;    DS:SI   l1 (left side of string 1),
;    BX      r1 (right side of string 1),
;    ES:DI   l2 (left side of string 2),
;    BP      r2 (right side of string 2).
;
         mov     cx, bp            ; Save r2.
         mov     bp, stcknum       ; Get # entries on stack.
         shl     bp, 1             ; Multiply by 2 for words.
         mov     [bp+ststr1l], si  ; Put left side string 1 on stack.
         mov     [bp+ststr1r], bx  ; Put right side string 1 on stack.
         mov     [bp+ststr2l], di  ; Put left side string 2 on stack.
         mov     [bp+ststr2r], cx  ; Put right side string 2 on stack.
         inc     stcknum           ; Add 1 to number stack entries.
         mov     bp, cx            ; Restore r2.
         ret                       ; Return.
;
pushst   ENDP
;
;
popst    PROC    NEAR
;
;  Pass to this routine:
;    DS:SI   l1 (left side of string 1),
;    BX      r1 (right side of string 1),
;    ES:DI   l2 (left side of string 2),
;    BP      r2 (right side of string 2).
;
         dec     stcknum           ; Point to last entry in stack.
         mov     bp, stcknum       ; Get # entries on stack.
         shl     bp, 1             ; * 2 for words.
         mov     si, [bp+ststr1l]  ; Restore left string 1 from stk.
         mov     bx, [bp+ststr1r]  ; Restore right string 1 from stk.
         mov     di, [bp+ststr2l]  ; Restore left string 2 from stk.
         mov     bp, [bp+ststr2r]  ; Restore right string 2 from stk.
         ret                       ; Return.
;
popst    ENDP
;
;
Code     ENDS
;
         END    ; of file SimilF.Asm.
