{ ==========================================================================
         Title: AsmCount
    Programmer: David Neal Dubois

   Description: Unit AsmCount provides objects for word counting.
                Three objects are provided: CountObj, which counts
                words in an ASCII file; SprintCountObj, which can count
                words in a file created by Borland's Sprint word
                processor; and AmiCountObj, designed to count words
                in AmiPro files.

                A word is defined as a sequence of non-separators. The
                non-separators are the letters, digits, and apostrophe.
                The counters for Sprint and AmiPro parse for control code
                sequences. A control code is treated as a separator.

                The formats in which Sprint and AmiPro store control codes
                were determined emperically by David Gerrold. Sprint
                brackets control codes by ASCII codes 15 and 14, (Ctrl-O and
                Ctrl-N). Control codes may be nested. AmiPro indicates the
                beginning of a control code by one of these two character
                sequences: '<@', '<+', or '<-'. The code is terminated by
                a '>'.

                To use the counters, first initialize a *CountObj using the
                Init constructor passing the file name as a parameter.
                Then call the Count method. When the count method returns
                the field WordCount will hold the number of words in the
                file. Example:

                    var
                      Counter : CountObj;
                    begin
                      with Counter do
                        begin
                          Init ( 'Test' );
                          Count;
                          writeln ( 'number of words: ', WordCount );
                          Done;
                        end;
                    end.

                The Count methods are written in assembly language for
                maximum speed. Counting the words in a one megabyte ASCII
                text file stored on RAM disk on a 386, 25 MHz machine takes
                about 1.5 seconds.

        Method: Count reads from the file in blocks of 65520 characters at
                a time. As each character is processed, Count looks checks
                a look-up table to determine whether the character is a
                separator. When a non-separator is found followed by a
                separator, a word is counted.
  ========================================================================== }

unit AsmCoun1;

interface

  { -----------------------------------------------------------------------
    BlockSize    - The maximum number of characters to be read from a file
                   at one time. This is set to 65520, the largest block
                   which can be allocated from the Turbo Pascal heap.

    Init/Done    - The Init constructor opens the file. The Done destructor
                   closes it.

    Count        - This method performs the actual word count.

    InitCount    - Used internally by Count. This method initializes the
                   look-up table, ensures the file is at its beginning
                   (in case Count was called twice), and allocates a block
                   of memory from the heap to act as a file buffer.

    FiniCount    - Used internally by Count. Disposes of the file buffer.

    ReadBlock    - Used by Count to read a block from the file. At this time
                   the ShowProgress method is called.

    ShowProgress - Gives the user some indication of the Count method's
                   progress. Here it simply prints a period as each block
                   is read. This could changed to produce a percentage-bar.
    ----------------------------------------------------------------------- }

  const
    BlockSize = 65520;
  type
    BlockType = array [ 1 .. BlockSize ] of char;
    BlockPtr  = ^ BlockType;

    CountPtr = ^ CountObj;
    CountObj = object
                 TheFile   : file;
                 WordCount : longint;

                 constructor Init ( FileName : string );
                 destructor  Done;

                 procedure Count;     virtual;
                 procedure InitCount; virtual;
                 procedure FiniCount; virtual;
                 procedure ReadBlock;
                 procedure ShowProgress;
               end;

    SprintCountPtr = ^ SprintCountObj;
    SprintCountObj = object ( CountObj )
                       procedure Count;     virtual;
                       procedure InitCount; virtual;
                     end;

    AmiCountPtr = ^ SprintCountObj;
    AmiCountObj = object ( CountObj )
                    procedure Count; virtual;
                  end;

implementation

  { -----------------------------------------------------------------------
    These global variables are used internally by the Count method. Storing
    these in the global data segment simplifies the assembly code which
    accesses them.

    Block   - A pointer to the file buffer allocated by InitCount.

    Table   - The look-up table used to determine whether a character
              is a separator.

    Actual  - When a block is read from a file, the actual number of bytes
              which were read.

    SaveBP  - SprintObj.Count uses the BP register internally. It stores
              the original value here for safe keeping.
    ----------------------------------------------------------------------- }

  var
    Block  : BlockPtr;
    Table  : array [ char ] of byte;
    Actual : word;
    SaveBP : word;

  { -----------------------------------------------------------------------
    ShowProgress - Let the user know that progress is being made.
    ----------------------------------------------------------------------- }

  procedure CountObj . ShowProgress;
  begin
   { write ( '.' ); } {Can't use this within a DLL!!! }
  end;

  { -----------------------------------------------------------------------
    Init - Open a binary file with a record size of 1 byte.
    ----------------------------------------------------------------------- }

  constructor CountObj . Init ( FileName : string );
  begin
    assign ( TheFile, FileName );
    reset  ( TheFile, 1 );
    WordCount := 0;
  end;

  { -----------------------------------------------------------------------
    Done - Close the file.
    ----------------------------------------------------------------------- }

  destructor CountObj . Done;
  begin
    close ( TheFile );
  end;

  { -----------------------------------------------------------------------
    InitCount - performs three tasks.

                [1] Ensure the file pointer is set to the beginning of the
                    file.

                [2] Allocate a file buffer from the heap.

                [3] Generate the look-up table used to determine whether
                    a character is a separator. A separator is indicated
                    by a one stored in the table, while a non-separator
                    is indicated by a zero.
    ----------------------------------------------------------------------- }

  procedure CountObj . InitCount;
  var
    C : char;
  begin
    seek ( TheFile, 0 );

    new ( Block );

    for C := #0 to #255 do
      if C in [ '''', '0' .. '9', 'A' .. 'Z', 'a' .. 'z' ] then
        Table [ C ] := 0
      else
        Table [ C ] := 1;
  end;

  { -----------------------------------------------------------------------
    FiniCount - Dispose of the file buffer.
    ----------------------------------------------------------------------- }

  procedure CountObj . FiniCount;
  begin
    dispose ( Block );
  end;

  { -----------------------------------------------------------------------
    ReadBlock - Read a block from the file into the buffer. Call
                ShowProgress so the user knows what's happening.
    ----------------------------------------------------------------------- }

  procedure CountObj . ReadBlock;
  begin
    ShowProgress;
    blockread ( TheFile, Block ^, BlockSize, Actual );
  end;

  { -----------------------------------------------------------------------
    Count - Count the words in an ASCII text file.

      Register usage: AL - indicates whether the current character is a
                           separator. One for separator, zero for
                           non-separator.
                      AH - indicates whether the previous character was a
                           separator.
                      BX - points to look-up table.
                      CX - number of characters left to be processed in
                           buffer.
                   DX:DI - 32-bit word count
                   ES:SI - points to next character to be processed.
    ----------------------------------------------------------------------- }

  procedure CountObj . Count;
  var
    TempCount : longint;
  begin
    InitCount;

    asm
      cld                      { Clear direction flag. }

      xor   DX, DX             { Set word count to zero. }
      mov   DI, DX

      mov   AX, 0101h          { Treat beginning of file as separators.   }

      call  @CallReadBlock     { Fill file buffer.                        }

    @ProcessNormalChar:
      mov   AH, AL             { Set last character separator flag.       }
      seges lodsb              { Read a character from the buffer.        }
      xlat                     { Check look-up table.                     }
      cmp   AX, 0001h          { AX is 1 if this character is a separator }
                               {   and the previous character isn't.      }
      jz    @CountWord         { If this is the case, count a word.       }
      loop  @ProcessNormalChar { Process the next character.              }
      call  @CallReadBlock     { If we've processed the whole buffer then }
      jmp   @ProcessNormalChar {   fill it again.                         }

    @CountWord:
      add   DI, 1              { Increment the word counter.              }
      adc   DX, 0
      loop  @ProcessNormalChar { Process the next character.              }
      call  @CallReadBlock
      jmp   @ProcessNormalChar

    @CallReadBlock:
      pushf                    { Before calling the ReadBlock method,     }
      push  AX                 {   save current state of registers.       }
      push  DX
      push  DI
    end; { asm }

    ReadBlock;                 { Read a block of characters into buffer.  }

    asm
      pop   DI                 { Restore registers.                       }
      pop   DX
      pop   AX
      popf

      mov   BX, offset Table   { Set BX to point to look-up table.        }
      les   SI, [ Block ]      { Load address of file buffer into ES:SI.  }
      mov   CX, [ Actual ]     { Find out number of characters in buffer. }
      jcxz  @EndOfFile         { If there are no characters in buffer,    }
                               {   then the end of file has been reached. }
      retn                     { Return from CallReadBlock.               }

    @EndOfFile:
      add   SP, 2              { Pop and ignore CallReadBlock's return    }
                               {   address.                               }

      cmp   AL, 0              { Special consideration must be given to   }
      jnz   @Fini              {   the end-of-file. If the last character }
      add   DI, 1              {   processed was not a separator, count   }
      adc   DX, 0              {   a word.                                }

    @Fini:
      mov   [ word ptr TempCount     ], DI  { Store the result in         }
      mov   [ word ptr TempCount + 2 ], DX  {   TempCount.                }
    end; { asm }

    FiniCount;                 { Dispose of file buffer.                  }
    WordCount := TempCount;    { Return word count.                       }
  end;

  { -----------------------------------------------------------------------
    The Sprint counter is similar to the ASCII counter, except that it
    parses for Sprint control codes. Sprint brackets control codes by
    Ctrl-O and Ctrl-N. These control codes may be nested. Processing
    proceeds identically to that of the ASCII counter, until a Ctrl-O is
    found.

    When a control code sequence is found, the Sprint counter enters
    another loop. The loop processes characters, keeping count of the
    depth of control codes, adding one each time a ^O is encountered,
    and subtracting one each time a ^N is found. When the count reaches
    zero, the control code is terminated and control returns to normal
    word-counting loop.
    ----------------------------------------------------------------------- }

  { -----------------------------------------------------------------------
    InitCount - The Sprint counter's InitCount performs the same functions
                as the ASCII InitCount, except that a special value is
                placed in the look-up table for Ctrl-O to indicate the
                start of control-code processing.
    ----------------------------------------------------------------------- }

  procedure SprintCountObj . InitCount;
  begin
    CountObj . InitCount;
    Table [ ^O ] := $FF;
  end;

  { -----------------------------------------------------------------------
    Count - Count the words in a Sprint file. Comments annotate differences
            between this and CountObj.Count.

    There are two pieces of code in this procedure which are kind of
    tricky. These were designed to speed and simplify processing, but
    they'll require some explanation.

     ------------------------------------------------------------------
       Previous char   Current char    AH   AL    Need to do
     ------------------------------------------------------------------
         separator       separator     01   01    nothing
         separator     non-separator   01   00    nothing
         separator        Ctrl-O       01   FF    process control code
       non-separator     separator     00   01    count a word
       non-separator   non-separator   00   00    nothing
       non-separator      Ctrl-O       00   FF    process control code
     ------------------------------------------------------------------

    This table shows how the main loop must handle different
    situations that will occur. The focus of this code is to maximum
    speed. Therefore, the codes for the table were carefully chosen so
    that the more common "do nothing" situations can be distinguighed
    from the others with a single comparison. Once it has been
    determined that something must be done, it can be determined without
    another comparison.

                   cmp AL, AH
                   ja  @CountOrControlCode
                   <...do-nothing processing...>

                 @CountOrControlCode:
                   js  <...control-code processing...>
                   <...count a word...>

    The other bit of tricky code is in the control code sequence processing
    loop. It is necessary to add one to a counter if character number 15
    is found, and to subtract one if character 14 is found. Then, the
    loop must be terminated if the count has reached zero. This task is
    performed by the following code. (BP holds the depth counter.):

                   sub   AL, 14
                   sub   AL, 1
                   adc   BP, 0
                   sub   AL, 1
                   sbb   BP, 0
                   jz    <...loop complete...>

    ----------------------------------------------------------------------- }

  procedure SprintCountObj . Count;
  var
    TempCount : longint;
  begin
    InitCount;

    asm
      cld

      mov   [ SaveBP ], BP     { When it comes time to process a control  }
                               {   code sequence, the depth counter is    }
                               {   stored in BP. The original value of BP }
                               {   must be restored later when the method }
                               {   is complete, and also before ReadBlock }
                               {   is called.                             }

      xor   DX, DX
      mov   DI, DX
      mov   AX, 0101h
      call  @CallReadBlock

    @ProcessNormalChar:
      mov   AH, AL
      seges lodsb
      xlat
    @Continue:
      cmp   AL, AH               { See comments above.                    }
      ja    @CountOrControlCode

      loop  @ProcessNormalChar
      call  @CallReadBlock
      jmp   @ProcessNormalChar

    @CountOrControlCode:
      js    @ControlCode

      add   DI, 1                { Count a word.                          }
      adc   DX, 0
      loop  @ProcessNormalChar
      call  @CallReadBlock
      jmp   @ProcessNormalChar

    @ControlCode:
      mov   BP, 1                { Process a control code sequence. A     }
      jmp   @EndControlCodeLoop  {   control sequence begins with ^O and  }
                                 {   ends with ^N. Control codes may be   }
                                 {   nested. Therefore it is necessary to }
                                 {   keep a depth counter. As ^O's are    }
                                 {   processed, the counter will be       }
                                 {   incremented. ^N's will decrement the }
                                 {   counter. When the counter reaches    }
                                 {   zero, the control code is complete.  }
                                 {   The depth counter will be stored in  }
                                 {   BP. Since a ^O has already been      }
                                 {   found, the counter is set to 1.      }

    @ProcessControlCodeChar:
      seges lodsb                { Get next character.                    }

      sub   AL, 14               { See comments above.                    }
      sub   AL, 1
      adc   BP, 0
      sub   AL, 1
      sbb   BP, 0
      jz    @ControlCodeDone

    @EndControlCodeLoop:
      loop  @ProcessControlCodeChar  { Process next char. }
      mov   AL, AH                   { This line is necessary to ensure   }
                                     {   correct processing should the    }
                                     {   end-of-file already be reached.  }
      call  @CallReadBlock
      jmp   @ProcessControlCodeChar

    @ControlCodeDone:
       mov  AL, 1                { Return to normal processing as if a    }
       jmp  @Continue            {   a separator had been found.          }

    @CallReadBlock:
      pushf
      push  AX
      push  DX
      push  DI
      push  BP                   { Save the depth counter, and restore BP }
      mov   BP, [ SaveBP ]       {   will ReadBlock is called.            }
    end; { asm }

    ReadBlock;

    asm
      pop   BP
      pop   DI
      pop   DX
      pop   AX
      popf

      mov   BX, offset Table
      les   SI, [ Block ]
      mov   CX, [ Actual ]
      jcxz  @EndOfFile
      retn

    @EndOfFile:
      pop   CX
      cmp   AL, 1
      jz    @Fini
      add   DI, 1
      adc   DX, 0

    @Fini:
      mov   BP, [ SaveBP ]       { Restore original BP.                   }

      mov   [ word ptr TempCount     ], DI
      mov   [ word ptr TempCount + 2 ], DX
    end; { asm }

    FiniCount;
    WordCount := TempCount;
  end;

  { -----------------------------------------------------------------------
    AmiPro control code sequences begin with '<', followed by one of '@',
    '+' or '-'. The sequence terminates with a '>'. AmiCountObj.Count
    processes characters the same as CountObj.Count except that it checks
    each character to see if it is a '<'. When a '<' is encountered,
    another character is read and checked against '@', '+' and '-'. If
    one of these is found, Count enters a loop looking for '>'.
    ----------------------------------------------------------------------- }

  procedure AmiCountObj . Count;
  var
    TempCount : longint;
  begin
    InitCount;

    asm
      cld

      xor   DX, DX
      mov   DI, DX

      mov   AX, 0101h

      call  @CallReadBlock

    @ProcessNormalChar:
      mov   AH, AL
      seges lodsb

      cmp   AL, '<'              { Check to see if character is '<'. If   }
      jz    @PerhapsControlCode  {  it is, a control code sequence may be }
                                 {  beginning.                            }

    @Continue:
      xlat
      cmp   AX, 0001h
      jz    @CountWord
      loop  @ProcessNormalChar
      call  @CallReadBlock
      jmp   @ProcessNormalChar

    @CountWord:
      add   DI, 1
      adc   DX, 0
      loop  @ProcessNormalChar
      call  @CallReadBlock
      jmp   @ProcessNormalChar

    @ProcessCharAfterLT:
      seges lodsb                { Read the character that follows '<'.   }
      cmp   AL, '@'              { Check to see if it is '@', '+' or '-'. }
      jz    @ProcessControlCode
      cmp   AL, '+'
      jz    @ProcessControlCode
      cmp   AL, '-'
      jz    @ProcessControlCode
      jmp   @Continue

    @PerhapsControlCode:
      loop  @ProcessCharAfterLT
      mov   AL, AH
      call  @CallReadBlock
      jmp   @ProcessCharAfterLT

    @ProcessControlCodeChar:
      seges lodsb                { A control code sequence has begun.     }
      cmp   AL, '>'              { Process characters until a '>' is      }
      jz    @Continue            {   found.                               }

    @ProcessControlCode:
      loop  @ProcessControlCodeChar
      mov   AL, AH
      call  @CallReadBlock
      jmp   @ProcessControlCodeChar

    @CallReadBlock:
      pushf
      push  AX
      push  DX
      push  DI
    end; { asm }

    ReadBlock;

    asm
      pop   DI
      pop   DX
      pop   AX
      popf

      mov   BX, offset Table
      les   SI, [ Block ]
      mov   CX, [ Actual ]
      jcxz  @EndOfFile
      retn

    @EndOfFile:
      add   SP, 2
      cmp   AL, 0
      jnz   @Fini
      add   DI, 1
      adc   DX, 0

    @Fini:
      mov   [ word ptr TempCount     ], DI
      mov   [ word ptr TempCount + 2 ], DX
    end; { asm }

    FiniCount;
    WordCount := TempCount;
  end;

{begin}
end.