'Experimental LZW Compressor for QuickBASIC 4.5
'By Rich Geldreich 1992
'This program is in the public domain: use as you wish!
'(QB4.5 users: Use search & replace to change all of the "SSEG" strings
'to "VARSEG" strings in this program.)
'Please see QBLZW.BAS for more information on LZW compression in QB.

'If you have and questions or problems, write/call:
'
'Rich Geldreich
'410 Market St.
'Gloucester City, NJ 08030
'(609)-742-8752
'
'Benchmarks:      ORIGINAL  HUFFMAN2.BAS  C1.BAS  ZIP
'BCL71ENR.LIB     263245    216495        191799  159324
'BIG_1_3.PCX      7401      3926          2735    2374
'MESSAGES.TXT     226989    151750        113077  84044
'TIME.MOD         155394    102447        87460   75101
'
'
'
' Do not press ctrl+break while this program is compressing! The string
' pointers may change, which may result in an error!

DEFINT A-Z
DECLARE SUB PutByte (A)
DECLARE SUB PutCode (A)
DECLARE SUB Rebuild.Table (New.Entries)
DECLARE FUNCTION GetByte ()
DECLARE SUB Hash (Prefix, Suffix, Index, Found)

CONST True = -1, False = 0

DIM SHARED Prefix(6576), Suffix(6576), Code(6576)
DIM SHARED Used(4096)

DIM SHARED InBuffer$, IAddress, IEndAddress, Iseg
DIM SHARED OutBuffer$, OStartAddress, OAddress, OEndAddress, Oseg

DIM SHARED CodeSize, CurrentBit, Char&
DIM SHARED Shift(12) AS LONG


FOR A = 0 TO 12: READ Shift(A): NEXT
DATA 1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192


LOCATE , , 1
IF POS(0) <> 1 THEN PRINT


InBuffer$ = STRING$(4000, 0)   'input buffer
OutBuffer$ = STRING$(4000, 0)  'output buffer


A& = SADD(OutBuffer$)
A& = A& - 65536 * (A& < 0)
Oseg = SSEG(OutBuffer$) + (A& \ 16)     'Segment of buffer
OAddress = (A& MOD 16)                  'Current address in disk buffer
OEndAddress = OAddress + 4000           'End address of  buffer
OStartAddress = OAddress                'Start of buffer

'Open input file
File$ = COMMAND$
IF File$ = "" THEN LINE INPUT "File to compress? "; File$: File$ = LTRIM$(RTRIM$(File$))
IF File$ = "" THEN END
OPEN File$ FOR BINARY AS #1
FileLength& = LOF(1)
'Is it there?
IF FileLength& = 0 THEN
    CLOSE #1
    KILL COMMAND$
    PRINT COMMAND$; " not found"
    END
END IF
'Open output file
OPEN "output.lzw" FOR BINARY AS #2
'Is it already there?
IF LOF(2) <> 0 THEN
    'Kill output file and reopen it
    CLOSE #2
    KILL "output.lzw"
    OPEN "output.lzw" FOR BINARY AS #2
END IF
'CurrentLoc& - position in input file
CurrentLoc& = 2

'Compression codes:
'Code 256 = end of file
'Code 257 = increase code size
'Code 258 = rebuild table
'Code 259 - 4095 = available for strings
StartCode = 259                 'First LZW code that is available
NextCode = 259
'The maximum code that can be represented in 9 bits
MaxCode = 512
'Start with 9 bit code size
CodeSize = 9
'Current bit position in Char& - use for PutCode
CurrentBit = 0
'Char& is a temporary buffer; accumulates codes from main program and
'puts them in the output file once complete bytes have been
'built
Char& = 0

GOSUB ClearTable
'Get first byte from file(it's a special case)
Prefix = GetByte

PRINT "LZW Compressor For QuickBASIC 4.5"
PRINT "By Richard Geldreich June 2nd, 1992"
PRINT "Compressing "; File$
PRINT : PRINT : PRINT
'First line to start updating statistics
Y = CSRLIN - 3
'Main compression loop
DO
    DO
     
        IF CurrentLoc& > FileLength& THEN
            PutCode Prefix
            PutCode 256
            PutCode 0: PutCode 0
            OutBuffer$ = LEFT$(OutBuffer$, OAddress - OStartAddress)
            LOCATE Y, 1
            PRINT "Bytes In:"; CurrentLoc&; (100& * CurrentLoc&) \ FileLength&; "%"
            BytesOut& = LOF(2) + (OAddress - OStartAddress)
            PRINT "Bytes Out:"; BytesOut&
            PRINT "Total Compression:"; 100 - ((100& * BytesOut&) \ CurrentLoc&); "%                         ";
            PUT #2, , OutBuffer$
            CLOSE
            END
        ELSE
            
            Suffix = GetByte
            CurrentLoc& = CurrentLoc& + 1
            'We now have a Prefix:Suffix to search for.
            'If the search fails, put the Prefix in the output file
            'and set the Prefix equal to the character which caused
            'the failure.

            Hash Prefix, Suffix, Index, Found
            IF Found = True THEN
                Prefix = Code(Index)
                'update how many times this string was used
                Used(Prefix) = Used(Prefix) + 1
            END IF
        END IF
    LOOP WHILE Found = True

    'only increase the code size when required
    DO WHILE Prefix >= MaxCode AND CodeSize < 12
        PutCode 257
        MaxCode = MaxCode * 2
        CodeSize = CodeSize + 1
    LOOP
   
    PutCode Prefix

    'Put the new string into the hash table.
    Prefix(Index) = Prefix
    Suffix(Index) = Suffix
    Code(Index) = NextCode  'remember this string's code

    'Prefix is now equal to the character that caused the failure now.
    Prefix = Suffix
 
    NextCode = NextCode + 1
    'if there are too many strings then rebuild the encoding table
    IF NextCode > 4096 THEN
           
        PutCode 258 'send rebuild table code to decompressor

        Rebuild.Table New.Entries
        NextCode = New.Entries + StartCode
       
        IF NextCode > 4096 THEN
            GOSUB ClearTable
            NextCode = StartCode        'reset NextCode to top of tree
        END IF

        CodeSize = 9
        MaxCode = 512

        
    END IF

    'let the impatient user know we haven't hung up (yet!)
    PrintCounter = PrintCounter + 1     'see if time to update the
    IF PrintCounter = 512 THEN          'screen
        LOCATE Y, 1
        PRINT "Bytes In:"; CurrentLoc&; (100& * CurrentLoc&) \ FileLength&; "%"
        BytesOut& = LOF(2) + (OAddress - OStartAddress)
        PRINT "Bytes Out:"; BytesOut&
        PRINT "Compression:"; 100 - ((100& * BytesOut&) \ CurrentLoc&); "%  "; "CodeSize:"; CodeSize; "NextCode:"; NextCode; "   ";
        PrintCounter = 0
    END IF
LOOP
'clears the hash table
ClearTable:
    FOR A = 0 TO 6576
        Prefix(A) = -1
        Suffix(A) = -1
        Code(A) = -1
    NEXT
RETURN

'Reads one byte from the input buffer, and fills the buffer if it's emty.
FUNCTION GetByte STATIC
    IF IAddress = IEndAddress THEN
        GET #1, , InBuffer$
        A& = SADD(InBuffer$)
        A& = A& - 65536 * (A& < 0)
        Iseg = SSEG(InBuffer$) + (A& \ 16)
        IAddress = (A& MOD 16)
        IEndAddress = IAddress + 4000
    END IF
    DEF SEG = Iseg
    GetByte = PEEK(IAddress)
    IAddress = IAddress + 1
END FUNCTION

'Attempts to finds a prefix:suffix string.
SUB Hash (Prefix, Suffix, Index, Found)
    
    Index = (Prefix * 256& XOR Suffix) MOD 6577 'XOR hashing
    IF Index = 0 THEN  'is Index lucky enough to be 0?
        Offset = 1     'Set offset to 1, because 6577-0=6577
    ELSE
        Offset = 6577 - Index
    END IF
    DO 'until we find a match or don't
        IF Code(Index) = -1 THEN      'is there nothing here?
            Found = False             'yup, not found
            EXIT SUB
        'is this entry what we're looking for?
        ELSEIF Prefix(Index) = Prefix AND Suffix(Index) = Suffix THEN
            Found = True              'yup, found
            EXIT SUB
        ELSE 'retry until we find what were looking for or we find a blank
             'entry
            Index = Index - Offset
            IF Index < 0 THEN 'is index too far down?
                Index = Index + 6577 'yup, bring it up then
            END IF
        END IF
    LOOP
END SUB

'Throws a byte into the output buffer and writes the buffer if it's full.
SUB PutByte (A) STATIC
    IF OAddress = OEndAddress THEN
        PUT #2, , OutBuffer$
        OAddress = OStartAddress
    END IF
    DEF SEG = Oseg
    POKE OAddress, A
    OAddress = OAddress + 1
END SUB

'Throws one multi-bit code to the output file.
SUB PutCode (A) STATIC
    SHARED MaxCode
    IF A >= MaxCode THEN STOP

    Char& = Char& + A * Shift(CurrentBit)
    CurrentBit = CurrentBit + CodeSize
    DO WHILE CurrentBit > 7
        PutByte Char& AND 255
        Char& = Char& \ 256
        CurrentBit = CurrentBit - 8
    LOOP
END SUB

'This is the "experimental" part of the program. This procedure eliminates
'any strings which are not used in the encoding table: the usual result of
'doing this is greater compression.
'It isn't documented well yet... I'm still working on it.
SUB Rebuild.Table (New.Entries)
    DIM P(4096), S(4096), U(4096) AS LONG, Pn(4096), C(4096)
    DIM Location(4096)
   
    SHARED StartCode, MaxCode, Prefix
    Num.Entries = 0
    
    FOR A = 0 TO 6576
        C = Code(A)
        IF C <> -1 THEN 'valid code?
            IF Used(C) > 0 THEN 'was it used at all?
                Used(C) = 0
                P = Prefix(A): S = Suffix(A)
                P(Num.Entries) = P          'put it into a temporary table
                S(Num.Entries) = S
                U(Num.Entries) = P * 4096& + S
                C(C) = Num.Entries
                Num.Entries = Num.Entries + 1
            END IF
        END IF
    NEXT
    

    Num.Entries = Num.Entries - 1
    FOR A = 0 TO Num.Entries
        Pn(A) = A
    NEXT
        'sort the table according to it's prefix:suffix
    Mid = Num.Entries \ 2
    DO
        FOR A = 0 TO Num.Entries - Mid
            IF U(Pn(A)) > U(Pn(A + Mid)) THEN
                SWAP Pn(A), Pn(A + Mid)
                Swap.Flag = True
                CompareLow = A - Mid
                CompareHigh = A
                DO WHILE CompareLow >= 0
                    IF U(Pn(CompareLow)) > U(Pn(CompareHigh)) THEN
                        SWAP Pn(CompareLow), Pn(CompareHigh)
                        CompareHigh = CompareLow
                        CompareLow = CompareLow - Mid
                    ELSE
                        EXIT DO
                    END IF
                LOOP
               
            END IF
        NEXT
        
        Mid = Mid \ 2
    LOOP WHILE Mid > 0
    
    
    FOR A = 0 TO Num.Entries
        Location(Pn(A)) = A
    NEXT
    'clear the old hash table
    FOR A = 0 TO 6576
        Prefix(A) = -1
        Suffix(A) = -1
        Code(A) = -1
    NEXT
    
    'put each prefix:suffix into the hash table
    FOR A1 = 0 TO Num.Entries
        A = Pn(A1)
       
        P = P(A)
        S = S(A)
        IF P >= StartCode THEN 'is it pointing twards a string?
            P = StartCode + Location(C(P)) 'yup; update the pointer
        END IF
        IF S >= StartCode THEN
            S = StartCode + Location(C(S))
        END IF
        'where does this prefix:suffix go?
        Hash P, S, Index, 0
        'put it there
        Prefix(Index) = P
        Suffix(Index) = S
        Code(Index) = A1 + StartCode
        
    NEXT
    '# of entries in the hash table now
    New.Entries = Num.Entries + 1
END SUB

