	title	lzcomp - file compressor using limpel-ziv algorithm

;Tom Pfau
;Digital Equipment Corporation
;Parsippany, NJ
;
;v1.2, Toad Hall Tweak
; - Converting to COM file.
; - Gonna use BP to hold bit_offset throughout (faster)
; - Moved most buffers outside code space.
; - Now accepts input file on command line, otherwise inputs
;   from StdIn
; - Always outputs to StdOut
; Nuts .. StdIn doesn't seem to work .. donno why.
; Won't work as a filter either (which I had hoped for).
; WILL take cmd line file name, redirect it appropriately.
;
;David Kirschbaum
;Toad Hall
;kirsch@braggvax.ARPA

;Constants
CLEAR		equ	256		;Clear code
EOF		equ	257		;End of file marker
FIRST_FREE	equ	258		;First free code
MAXMAX		equ	4096		;Max code + 1
BUFFSIZE	EQU	4096		;TH read/write buffers
	include macros2.mlb

;Hash table entry
hash_rec	struc
first	dw	?			; First entry with this value
next	dw	?			; Next entry along chain
char	db	?			; Suffix char
hash_rec	ends

;Declare Segments
Code	segment para public 'code'
	assume	CS:Code, DS:Code, ES:Code

	org	100H

LzComp	proc	near
	jmp	Start

;TH collecting all data segment stuff
input_handle	dw	0		;default StdIn
prefix_code	dw	0	;?
free_code	dw	0	;?
max_code	dw	0	;?
nbits		dw	0	;?
k		db	0	;?

;bit_offset	dw	0	;?
input_offset	dw	0
input_size	dw	0
LzComp	endp


start	proc	near
;TH we won't mess with any memory allocating, since I think
;all required buffers, hash tables, etc. can work within
;a lousy 64Kb segment.
;Won't even bother moving the stackpointer for now either.

	mov	si,80H				;point to PSP cmd line
	lodsb					;snarf cmd line len byte
	or	al,al				;anything on cmd line?
	jz	Start_Process			;nope, use < stdin

	xor	ah,ah				;clear msb
;SI now points to 81H
	mov	dx,si				;not quite first char
	inc	dx				;DX points to target filename
	add	si,ax				;point to beyond last char
	mov	[si],ah				;make file name AsciiZ
	mov	ax,3D00H			;open file/device
	int	21H
	jb	Terminate			;failed somehow, die

	mov	input_handle,ax			;save input handle

Start_Process:
	call	compress		;Compress file
Terminate:
	mov	ah,4CH			;terminate, errorlevel in AL
	int	21H
start	endp

compress	proc	near

l1:	call	init_table		;Initialize the table and some vars
	mov	ax,CLEAR		;Write a clear code
	call	write_code
	call	read_char		;Read first char

l4:	xor	ah,ah			;Turn char into code
l4a:	mov	prefix_code,ax		;Set prefix code
	call	read_char		;Read next char
	jc	l17			;Carry means EOF
	mov	k,al			;Save char in k
	mov	bx,prefix_code		;Get prefix code
	call	lookup_code		;See if this pair in table
	jnc	l4a			;nc means yes, new code in ax
	call	add_code		;Add pair to table
	push	bx			;Save new code
	mov	ax,prefix_code		;Write old prefix code
	call	write_code
	pop	bx
	mov	al,k			;Get last char
	cmp	bx,max_code		;Exceed code size?
	jl	l4			;less means no
	cmp	nbits,12		;Currently less than 12 bits?
	jl	l14			;yes
	 mov	ax,CLEAR		;Write a clear code
	 call	write_code
	 call	init_table		;Reinit table
	 mov	al,k			;get last char
	 jmp	l4			;Start over

l14:	inc	nbits			;Increase number of bits
	shl	max_code,1		;Double max code size
	jmp	l4			;Get next char

l17:	mov	ax,prefix_code		;Write last code
	call	write_code
	mov	ax,EOF			;Write EOF code
	call	write_code
	mov	ax,bp	;bit_offset	;Make sure buffer is flushed to file
	or	ax,ax			;TH
	je	l18
	 mov	cx,8			;convert bits to bytes
	 xor	dx,dx
	 div	cx
	 or	dx,dx			;If extra bits, make sure they get
	 je	l17a			;written
	  inc	ax
l17a:	 call	flush
l18:	ret
compress	endp

init_table	proc	near
	mov	nbits,9			;Set code size to 9
	mov	max_code,512		;Set max code to 512

	mov	ax,-1			;Unused flag
	mov	cx,640			;Clear first 256 entries
	mov	di,offset hash		;TH Point to first entry
rep	stosw				;Clear it out
	mov	free_code,FIRST_FREE	;Set next code to use
	ret				;done
init_table	endp

write_code	proc	near
	push	ax			;Save code
	mov	ax,bp	;bit_offset	;Get bit offset
;TH we're keeping bit_offset in BP
;	mov	cx,nbits		;Adjust bit offset by code size
;	add	bit_offset,cx
	add	bp,nbits		;TH adjust bit offset by code size
	mov	cx,8			;Convert bit offset to byte offset
	xor	dx,dx
	div	cx
;	cmp	ax,1020			;Approaching end of buffer?
	cmp	ax,BUFFSIZE-4		;TH approaching end of buffer?
	jl	wc1			;less means no
	 call	flush			;Write the buffer
;TH we're keeping bit_offset in BP
;	 push	dx			;dx contains offset within byte
;	 add	dx,nbits		;adjust by code size
;	 mov	bit_offset,dx		;new bit offset
;	 pop	dx			;restore dx
	 mov	bp,dx			;TH dx contains offset within byte
	 add	bp,nbits		;TH adjust by code size

	 add	ax,offset output_data	;Point to last byte
	 mov	si,ax			;put in si
	 mov	al,[si]			;move byte to first position
	 mov	byte ptr output_data,al
	 xor	ax,ax			;Byte offset of zero
wc1:	add	ax,offset output_data	;Point into buffer
	mov	di,ax			;Destination
	pop	ax			;Restore code
	mov	cx,dx			;offset within byte
	xor	dx,dx			;dx will catch bits rotated out
	jcxz	wc3			;If offset in byte is zero, skip shift
wc2:	 shl	ax,1			;Rotate code
	 rcl	dx,1
	 loop	wc2
	 or	al,[di]			;Grab bits currently in buffer
wc3:	stosw				;Save data
	mov	al,dl			;Grab extra bits
	stosb				;and save
	ret
write_code	endp


flush		proc	near
	push	ax			;Save all registers
	push	dx
	mov	dx,offset output_data	;buffer to write
	mov	cx,ax			;AX contains number of bytes to write
	mov	bx,1			;StdOut
	mov	ah,40H			;write to file/device
	int	21H
	pop	dx
	pop	ax
	jb	File_Error		;failed somehow
	ret
flush		endp

read_char	proc	near
	mov	di,input_offset		;Anything left in buffer?
	cmp	di,input_size
	jl	rd1			;less means yes

	mov	dx,offset input_data	;buffer to read into
	mov	bx,input_handle		;input handle
;	mov	cx,1024			;read this many bytes
	mov	cx,BUFFSIZE		;TH read this many bytes
	mov	ah,3FH			;read from file/device
	int	21H
	jb	File_Error		;failed somehow, die
	or	ax,ax			;anything read?
	jz	rd2			;nope, we're finished

	mov	input_size,ax		;Save bytes read
	xor	di,di			;TH clear DI
	mov	input_offset,di	;TH 0	;Point to beginning of buffer
rd1:
;TH the mov/add instrs below are faster than the LEA.
;	lea	si,input_data[di]	;Point at character
	mov	si,offset input_data	;TH
	add	si,di			;Point at char
	lodsb				;Read it in
	inc	input_offset		;Adjust pointer
	clc				;Success
	ret

rd2:	stc				;Nothing left
	ret

File_Error:
	jmp	Terminate		;File error, terminate
					;errorlevel in AL
read_char	endp


lookup_code	proc	near
	call	index			;convert code to address
	xor	di,di	;TH 		;flag
	cmp	[si].first,-1		;Has this code been used?
	je	gc4			;equal means no

	inc	di			;set flag
	mov	bx,[si].first		;Get first entry
gc2:	call	index			;convert code to address
	cmp	[si].char,al		;is char the same?
	jne	gc3			;ne means no
	 clc				;success
	 mov	ax,bx			;put found code in ax
	 ret				;done

gc3:	cmp	[si].next,-1		;More left with this prefix?
	je	gc4			;equal means no
	 mov	bx,[si].next		;get next code
	 jmp	gc2			;try again

gc4:	stc				;not found
	ret				;done
lookup_code	endp


index		proc	near
	mov	si,bx			;si = bx * 5 (5 byte hash entries)
	shl	si,1			;si = bx * 2 * 2 + bx
	shl	si,1
	add	si,bx
	add	si,offset hash		;TH plus hash table base
	ret
index		endp

add_code	proc	near
;Only called once
	mov	bx,free_code		;Get code to use
	or	di,di	;TH		;First use of this prefix?
	je	ac1			;equal means yes
	 mov	[si].next,bx		;point last use to new entry
	 jmp	short ac2

ac1:	mov	[si].first,bx		;Point first use to new entry
ac2:	cmp	bx,MAXMAX		;Have we reached code limit?
	je	ac3			;equal means yes, just return
	 call	index			;get address of new entry
;TH switched around a little
	 mov	[si].char,al		;save suffix char
	 mov	ax,-1			;TH do this once (ok to destroy AX)
	 mov	[si].first,ax	;-1	;initialize pointers
	 mov	[si].next,ax	;-1
	 inc	free_code		;TH adjust next code
ac3:	ret
add_code	endp

;TH input/output buffers moved here outside code space
	even

output_data	equ	$			;db	1024 dup (?)
input_data	equ	output_data+BUFFSIZE	;db	1024 dup (?)

;Instead of using a separate segment for memory and the hash
;table (with all the resultant segment register fiddling),
;gonna just use a dynamic buffer right in code space.

hash	equ	input_data+BUFFSIZE	;1024

code	ends
	end	LzComp
