Newsgroups: comp.sources.misc
From: goer@midway.uchicago.edu (Richard L. Goerwitz)
Subject:  v23i070:  quranref - Holy Qur'an word and passage based retrievals, Part04/08
Message-ID: <1991Oct19.022300.12923@sparky.imd.sterling.com>
X-Md4-Signature: f2be581ed2d06ec701e8e7c057e32ebc
Date: Sat, 19 Oct 1991 02:23:00 GMT
Approved: kent@sparky.imd.sterling.com

Submitted-by: goer@midway.uchicago.edu (Richard L. Goerwitz)
Posting-number: Volume 23, Issue 70
Archive-name: quranref/part04
Environment: Icon

---- Cut Here and feed the following to sh ----
#!/bin/sh
# this is quranref.04 (part 4 of a multipart archive)
# do not concatenate these parts, unpack them in order with /bin/sh
# file huffcode.icn continued
#
if test ! -r _shar_seq_.tmp; then
	echo 'Please unpack part 1 first!'
	exit 1
fi
(read Scheck
 if test "$Scheck" != 4; then
	echo Please unpack part "$Scheck" next!
	exit 1
 else
	exit 0
 fi
) < _shar_seq_.tmp || exit 1
if test ! -f _shar_wnt_.tmp; then
	echo 'x - still skipping huffcode.icn'
else
echo 'x - continuing file huffcode.icn'
sed 's/^X//' << 'SHAR_EOF' >> 'huffcode.icn' &&
X    # 1.  That one element remaining (h[1]) is your Huffman tree.
X    #
X    # Based loosely on Sedgewick (2nd ed., 1988), p. 328-9.
X    #
X    local frst, scnd, count
X
X    until *h = 1 do {
X
X	h[1] :=: h[*h]		# Reverse first and last elements.
X	frst := pull(h)		# Pop last elem off & save it.
X	resettle_heap(h, 1)	# Resettle the heap.
X	scnd := !h		# Save (but don't clobber) top element.
X
X	count := frst.n + scnd.n
X	frst := { if *frst = 2 then frst.c else _N(frst.l, frst.r) }
X	scnd := { if *scnd = 2 then scnd.c else _N(scnd.l, scnd.r) }
X
X	h[1] := node(frst, scnd, count) # Create new node().
X	resettle_heap(h, 1)	# Resettle once again.
X    }
X
X    # H is no longer a stack.  It's single element - the root of a
X    # Huffman tree made up of node()s and leaf()s.  Put the l and r
X    # fields of that element into an _N record, and return the new
X    # record.
X    return _N(h[1].l, h[1].r)
X
Xend
X
X
Xprocedure hash_huffcodes(tr)
X
X    #
X    # Hash Huffman codes.  Tr (arg 1) is a Huffman tree created by
X    # heap_2_huffman_tree(heap).  Output is a table, with the keys
X    # representing characters, and the values being records of type
X    # hcode(i,len), where i is the Huffcode (an integer) and len is
X    # the number of bits it occupies.
X    #
X    local code, huffman_table
X
X    huffman_table := table()
X    every code := build_codes(tr) do
X	insert(huffman_table, code.c, code)
X    return huffman_table
X
Xend
X    
X
Xprocedure build_codes(tr, i, len)
X
X    #
X    # Decompose Huffman tree tr into hcode() records which contain
X    # 3 fields:  c (the character encoded), i (its integer code),
X    # and len (the number of bytes the integer code occupies).  Sus-
X    # pend one such record for each character encoded in tree tr.
X    #
X
X    if type(tr) == "string" then
X	return hcode(tr, i, len)
X    else {
X	(/len := 1) | (len +:= 1)
X	(/i   := 0) | (i   *:= 2)
X	suspend build_codes(tr.l, i, len)
X	i   +:= 1
X	suspend build_codes(tr.r, i, len)
X    }
X
Xend
X
X
Xprocedure block_encode(s, huffman_table)
X
X    #
X    # Write to file f string s encoded using huffman_table (a table having
X    # chars as keys and huffman codes as values).
X    #
X    # Create huffman_table as follows (char_tbl is a table, with chars as
X    # keys and frequencies as values):
X    #
X    # heap  := heap_init(char_tbl)
X    # hufftree := heap_2_huffman_tree(heap)
X    # huffman_table  := hash_huffcodes(hufftree)
X    #
X    # Store the tree, hufftree.  Pass the huffman table to block_encode as
X    # its second argument.
X
X    local s2, size, hcode_4_chr, chr
X
X    *s > 2r1111111111111111 &
X	stop("write_string:  too many characters in s")
X
X    s2 := ""			# initialize size string
X    outbits()			# just in case
X    every s2 ||:= outbits(*s, 16) # block size = 2 bytes
X
X    s ? {
X	while chr := move(1) do {
X	    hcode_4_chr := \huffman_table[chr] |
X		stop("write_string:  unexpected char, ",image(chr))
X	    every s2 ||:= outbits(hcode_4_chr.i, hcode_4_chr.len)
X	}
X	s2 ||:= outbits()
X    }
X
X    return s2
X
Xend
X
X
Xprocedure block_decode(f, huff_tree)
X
X    # Undo what block_encode does.
X    
X    local how_many, s2, E, chr, bit
X
X    s2 := ""
X
X    # The first two bytes record how many characters the original
X    # text had in it.  If the read fails, it means that the file
X    # system filled up while making the index, and the bitmaps now
X    # can't be located in f.
X    how_many := ishift(ord(reads(f)), 8) + ord(reads(f)) |
X	stop("block_decode:  failure reading ",image(f))
X    # If the original text was blank (zero characters), then return
X    # an empty string.
X    if how_many = 0 then { return "" }
X
X    reads(f, how_many) ? {
X
X	# Otherwise, set E = to the top node of the Huffman tree, and
X	# begin decoding.
X	E := huff_tree
X	while chr := move(1) do {
X	    every bit := iand(1, ishift(ord(chr), -7 to 0)) do {
X		E := { if bit = 0 then E.l else E.r }
X		if s2 ||:= string(E) then {
X		    if *s2 = how_many
X		    then return s2
X		    else E := huff_tree
X		}
X	    }
X	}
X    }
X
X    # If we get to here, something is quite amiss!
X    stop("read_string:  bad character count")
X
Xend
SHAR_EOF
echo 'File huffcode.icn is complete' &&
true || echo 'restore of huffcode.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= binsrch.icn ==============
if test -f 'binsrch.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping binsrch.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting binsrch.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'binsrch.icn' &&
X############################################################################
X#
X#	Name:	 binsrch.icn
X#
X#	Title:	 general-purpose binary index search
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.4
X#
X############################################################################
X#
X#  This file contains a single procedure, binary_index_search(str,
X#  filename), which goes through a file called filename looking for a
X#  line beginning with str.  Note well that binary_index_search()
X#  assumes lines in filename will contain more than str.  Str must
X#  occupy the first part of the line, separated from the remainder by
X#  a tab.
X#
X############################################################################
X#
X#  Links: none
X#
X#  See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X
Xprocedure binary_index_search(entry, index_filename)
X
X    local in_index, bottom, top, loc, incr, firstpart, offset
X
X    in_index := open(index_filename) |
X	abort("binary_index_search","can't open "||index_filename,18)
X
X    bottom := 1
X    seek(in_index, 0)
X    top := where(in_index)
X
X    # If bottom gets bigger than top, there's no such entry.
X    until bottom > top do {
X
X	loc := (top+bottom) / 2
X	seek(in_index, loc)
X
X	# Move past next newline.  If at bottom, break.
X	incr := 1
X	until reads(in_index) == "\n" do
X	    incr +:= 1
X	if loc+incr = bottom then {
X	    top := loc-1
X	    next
X	}
X
X	# Check to see if the current line starts with entry (arg 1).
X	read(in_index) ? {
X
X	    # .IND file line format is entry\tbitmap-file-offset
X	    if entry == (firstpart := tab(find("\t"))) then {
X		# return offset
X		return (move(1), tab(0))
X	    }
X	    # Ah, this is what all binary searches do.
X	    else {
X		if entry << firstpart
X		then top := loc-1
X		else bottom := loc + incr + *&subject
X	    }
X	}
X    }
X
Xend
SHAR_EOF
true || echo 'restore of binsrch.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= bmp2text.icn ==============
if test -f 'bmp2text.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping bmp2text.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting bmp2text.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'bmp2text.icn' &&
X############################################################################
X#
X#	Name:	 bmp2text.icn
X#
X#	Title:	 convert a bitmap to a text-chunk
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 2.6
X#
X############################################################################
X#
X#  This file contains bitmap_2_text(bitmap, filename).  Recall that
X#  bitmaps are just a series of fixed-length bitfields used to mark
X#  divisions within a text.  The procedure retrieve() locates words in
X#  an index file, and returns a list of these bitmaps, which point to
X#  divisions within the original text file - divisions within which a
X#  given indexed word found by retrieve() occurs.  The procedure
X#  bitmap_2_filename() simply takes a given bitmap and finds the text
X#  with which it is associated in the main text file.
X#
X#  Note that bitmap_2_text() does not seek directly to the correct
X#  location within "filename" (arg 2).  It first breaks down the
X#  bitmap into a less precise form via an offset table (read in via
X#  the .OFS file), looks up the precise location of the bitmap in the
X#  .UNT file, and then finally seeks up to that location in the main
X#  text file, decodes the text it finds at that location, and then
X#  returns the decoded section as a string.  The reason
X#  bitmap_2_text() does this is that makeind (the indexing routine
X#  which creates data files for retrieve() and bitmap_2_text()) does
X#  not store the offset within the main text for every bitmap.  It
X#  just saves the locations of major blocks in the .OFS file, then
X#  keeps a full list on disk in the .UNT file.  This is basically just
X#  a space-saving device.  It would eat up too much core memory to
X#  keep a list of every offset for every chunk of text marked out by a
X#  bitmap in filename.
X#
X#  Note also that, although retrieve() returns a list of bitmaps, bit-
X#  map_2_text(bitmap, filename) expects a single bitmap as its first
X#  argument.  It is better that text be retrieved as needed, one chunk
X#  at a time, and not stuffed en masse into core memory as soon as it
X#  is retrieve()'d.
X#
X############################################################################
X#
X#  Links: ./indexutl.icn, ./initfile.icn
X#
X#  See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X# Declared in indexutl.icn.
X# record is(FS, s_len, len, no, is_case_sensitive, hufftree)
X# global IS
X
X# Declared in initfile.icn.
X# global filestats
X# record Fs(ind_filename, bmp_filename, lim_filename, unt_filename,
X#           IS, ofs_table)
X
Xprocedure bitmap_2_text(bitmap, filename)
X
X    local cut_down_bitmap, upto_field, offset, line, base_value_mask,
X	base_value, location
X    static t, old_main_filename, in_main_file, in_unt_file
X    # global filestats, IS
X    initial {
X	t := table()
X	old_main_filename := ""
X    }
X
X    # Check for sloppy programming.
X    /filename & abort("bitmap_2_text","you called me without a filename",29)
X
X    if old_main_filename ~==:= filename then {
X	#
X        # If necessary, initialize stats for the current file.
X        #
X	if /filestats | /filestats[filename]
X	then initfile(filename)           # see initfile.icn
X        # open full text file for reading
X	every close(\in_main_file | \in_unt_file)
X	in_main_file := open(filename) |
X	    abort("bitmap_2_text", "can't open "||filename, 26)
X	in_unt_file := open(filestats[filename].unt_filename) |
X	    abort("bitmap_2_text", "can't open .UNT file for "||filename, 27)
X    }
X
X    # Reset IS to current file.
X    IS := filestats[filename].IS
X
X    # Determine offset to seek to by using the bitmap->offset table
X    # for the current file (arg 2).  The name of the bitmap_offset
X    # table is stored in filestats[filename].ofs_table.
X    #
X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
X    cut_down_bitmap := ishift(bitmap, -(IS.no - upto_field) * IS.len)
X    offset := \filestats[filename].ofs_table[cut_down_bitmap] | fail
X
X    # Seek to offset, and begin looking for the string equiv. of
X    # bitmap (arg 1).
X    #
X    seek(in_unt_file, offset) |
X	abort("bitmap_2_text","can't seek to offset "||offset, 27)
X    
X    #
X    # Find the major text division for bitmap using the offset table
X    # (in filestats[filename].ofs_table), look up its precise loca-
X    # tion in the .UNT file, then seek to that location in the main
X    # text file and decode whatever text is encoded at that location.
X    #
X
X    #
X    # First figure out how to tell if we've gone too far.  Basically,
X    # mask out the lower bits, and record the value of the upper bits.
X    # Some fooling around is necessary because bitmaps may use large
X    # ints, making it impossible to use icom() in a naive manner.
X    # If the upper bits of the bitmaps being read change, then we've
X    # gone too far.
X    #
X    base_value_mask := icom(2^((IS.no - upto_field) * IS.len)- 1)
X    base_value := iand(bitmap, base_value_mask)
X
X    while line := read(in_unt_file) do {
X	line ? {
X	    location := digits_2_bitmap(tab(find("\t"))) # in indexutl.icn
X	    if bitmap = location then {
X		move(1)		# move past tab character
X		# block_decode() is in huffcode.icn; decodes the encoded
X		# verse and returns the result (should be an ASCII string)
X		seek(in_main_file, offset := integer(tab(0))) |
X		    abort("bitmap_2_text","unable to seek to "||offset,28)
X		return block_decode(in_main_file, IS.hufftree)
X	    }
X	    else {
X		if base_value ~= iand(location, base_value_mask)
X		then fail
X	    }
X	}
X    }
X
X    # we should have returned by now
X    fail
X
Xend
SHAR_EOF
true || echo 'restore of bmp2text.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= initfile.icn ==============
if test -f 'initfile.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping initfile.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting initfile.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'initfile.icn' &&
X############################################################################
X#
X#	Name:	 initfile.icn
X#
X#	Title:	 initialize entry for file in filestats table
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 2.2
X#
X############################################################################
X#
X#  This file contains initfile(filename), which creates a set of stats
X#  for the indexed database contained in filename.  Uses several global
X#  structures, primarily for speed.  Beware.
X#
X############################################################################
X#
X#  See also: retrieve.icn, bmp2text.icn, retrops.icn
X#
X############################################################################
X
X# Used to store stats for each filename.
Xrecord Fs(ind_filename, bmp_filename, lim_filename, unt_filename,
X	  IS, ofs_table)
X
X# IS is declared in indexutl.icn.
X# global IS
X
Xglobal filestats
X
Xprocedure initfile(filename)
X
X    # Messy procedure which creates and stores the names of several
X    # files that will be repeatedly used with "filename."  Reads in
X    # the stats for filename from that file's .IS file.  Also reads in
X    # the bitmap->offset (.OFS file) table, and puts it into
X    # filestats[filename].ofs_table for later (re-)use.  The bitmap->
X    # offset table contains pointers into the .UNT file for filename,
X    # which lists all the main text divisions, with pointers into the
X    # main text file (i.e. filename) for each division.  The scheme
X    # is: .OFS file (locates larger divisions) -> .UNT file (contains
X    # the offsets for smaller divisions in filename) -> filename (the
X    # actual compressed text).
X
X    local IS_filename, in_IS, upto_field, stored_bitmap_length,
X	ofs_filename, intext, cut_down_bitmap, block_size, offset
X    # global filestats
X    initial {
X	filestats := table()
X	# OS-specific parameters are initialized here.
X	initialize_os_params()	# in indexutl.icn
X    }
X
X    # Check for sloppy programming.  Did we do this one already??
X    if not (/filestats[filename] := Fs(,,,,,table())) then fail
X
X    filestats[filename].ind_filename :=
X	dir_name(filename)||create_fname(filename, "IND")
X    filestats[filename].bmp_filename :=
X	dir_name(filename)||create_fname(filename, "BMP")
X    filestats[filename].lim_filename :=
X	dir_name(filename)||create_fname(filename, "LIM")
X    filestats[filename].unt_filename :=
X	dir_name(filename)||create_fname(filename, "UNT")
X
X    # Decode stored IS record for filename.
X    IS_filename := dir_name(filename)||create_fname(filename, "IS")
X    in_IS := open(IS_filename) | abort("bitmap_2_text",
X	"Can't open "||IS_filename||".  Did you forget to index?", 24)
X    filestats[filename].IS := decode(!in_IS)
X    close(in_IS)
X	
X    # Having decoded IS, we can now determine the length of the cut-
X    # down bitmaps stored in the .OFS file for filename.
X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
X    stored_bitmap_length :=
X	((filestats[filename].IS.len * upto_field) <= seq(0,8))
X
X    # open .OFS file
X    ofs_filename := dir_name(filename)||create_fname(filename, "OFS")
X    intext := open(ofs_filename) |
X	abort("bitmap_2_text", "can't open "||ofs_filename, 23)
X    
X    # read in blocks from .OFS file, breaking them into their
X    # constituent parts
X    while block_size := read_int(intext, 8) * 8 do {
X	cut_down_bitmap := read_int(intext, stored_bitmap_length)
X	offset := read_int(intext, block_size - stored_bitmap_length)
X	insert(filestats[filename].ofs_table, cut_down_bitmap, offset)
X    }
X    close(intext)
X
X    return *filestats[filename].ofs_table
X
Xend
SHAR_EOF
true || echo 'restore of initfile.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= retrieve.icn ==============
if test -f 'retrieve.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping retrieve.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting retrieve.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'retrieve.icn' &&
X############################################################################
X#
X#	Name:	 retrieve.icn
X#
X#	Title:	 retrieve locations of words in database file
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.25
X#
X############################################################################
X#
X#  Retrieve(pattern, filename) retrieves all locations containing
X#  words matching pattern (arg1) in filename (arg2), placing them in a
X#  list.  "Locations" are integer-coded pointers to places in filename
X#  where corresponding text is located.  To actually retrieve that
X#  block of text, you must call bitmap_2_text(location, filename).
X#  Retrieve() only gathers up a list of locations in filename
X#  containing words which match pattern.
X#
X#  The reason retrieve() doesn't do the logical thing - namely, to
X#  "retrieve" text itself - is that doing so might use a *lot* of
X#  memory.  It is far more economical to retrieve text only when a
X#  given chunk is requested via bitmap_2_text().
X#
X#  Note:  Patterns must match words in their entirety.  For instance,
X#  retrieve("dog",filename) would only retrieve exact matches for the
X#  word "dog" in filename.  To catch, say, "doggie" as well, it would
X#  be necessary to call retrieve with a regular expression that
X#  matched both dog and doggie (e.g. retrieve("dog.*",filename)).
X#
X############################################################################
X#
X#  Links: codeobj.icn, ./indexutl.icn, ./binsrch.icn, ./initfile.icn
X#         ./findre.icn shquote.icn
X#
X#  See also: makeind.icn, bmp2text.icn
X#
X############################################################################
X
Xlink codeobj, shquote
X
X# The following globals contain stats for current file (here, arg2).
X# global filestats    # declared in initfile.icn
X# global IS           # declared in indexutl.icn
X
Xprocedure retrieve(pattern, filename, inverse)
X
X    local bitmap_set, bmp_file, in_egrep, intext, cmd,
X	offset, pattern2, line
X    static is_UNIX, egrep_filename
X    initial {
X	if is_UNIX := find("UNIX",&features) then
X	    # If egrep is available, use it.  It's fast.
X	    egrep_filename := "egrep"
X	    # egrep_filename := "/usr/local/bin/gnuegrep"
X    }
X
X    # Check for sloppy programming.
X    /filename & abort("retrieve","you called me without a filename",22)
X
X    # Initialize important variables.
X    #
X    if /filestats | /filestats[filename]
X    then initfile(filename)           # see initfile.icn
X    bitmap_set := set()		      # list will contain locations of hits
X    IS := filestats[filename].IS      # re-initialize IS for current file
X    if /IS.is_case_sensitive then
X	pattern := map(pattern)
X
X    # Open bitmap file.
X    #
X    bmp_file := open(filestats[filename].bmp_filename) |
X	abort("retrieve","can't open "||filestats[filename].bmp_filename, 29)
X
X    # Search index.
X    #
X    if are_metas(pattern) then {
X    # NB: are_metas() can be found in indexutl.icn
X
X	# If there are metacharacters in pattern, do a regexp pattern match.
X	# The .IND file goes:  line ::= key \t other-stuff.
X	pattern := "^(" || pattern || ")\t"
X
X	# If UNIX, then use egrep to search index.
X	#
X	if \is_UNIX then {
X
X	    # Set up command line to be passed to /bin/sh.  First make
X	    # sure we don't have any apostrophe's hanging around to
X	    # screw up the command line to be passed to /bin/sh, then
X	    # put together a command line to be passed to egrep.
X	    pattern2 := shquote(pattern)                # from the IPL
X
X	    cmd := egrep_filename || " " || pattern2 ||
X		" " || filestats[filename].ind_filename ||
X		" 2>&1"
X	    # open pipe
X	    in_egrep := open(cmd, "rp") |
X		abort("retrieve","can't open pipe from\n\t"||cmd, 20)
X	    # grep .IND index file
X	    every line := !in_egrep do {
X		# Kludge, but it's the only way to tell if there's an error.
X		find("error"|"grep", line) & {
X		    # Define some routine here that issues a warning; there
X		    # is no need to actually abort!
X		    (\err_message)("Regular expression syntax error.") |
X			stop("retrieve:  regexp syntax error")
X		    break
X		}
X		line ? (tab(find("\t")+1), offset := integer(tab(0)))
X		bitmap_set ++:=
X		    retrieve_bitmaps(offset, bmp_file)
X	    }
X	    close(in_egrep)
X
X	# ...otherwise (i.e. if not UNIX) use findre() instead of egrep
X	#
X	} else {
X
X	    # Probably MS-DOS or something else.  SLOW, SLOW!
X	    intext := open(filestats[filename].ind_filename) |
X		abort("retrieve","can't open index file", 21)
X	    # grep .IND file
X	    every line := !intext do {
X		line ? {
X		    if findre(pattern) then {
X			offset := integer(tab(0))
X			bitmap_set ++:=
X			    retrieve_bitmaps(offset, bmp_file)
X		    }
X		}
X	    }
X	    every close(bmp_file | intext)
X	}
X
X    # If *not* are_metas(pattern), then do a binary search of the index.
X    # No need to worry about is_UNIX, egrep, findre(), etc.
X    #
X    } else {
X	if offset :=
X	    binary_index_search(pattern, filestats[filename].ind_filename)
X	then bitmap_set ++:=
X	    retrieve_bitmaps(offset, bmp_file)
X    }
X
X    # If inverse (arg 3) is nonnull, then invert the sense of the search.
X    # Do this by knocking out those parts of the full bitmap set that are
X    # in the bitmap_set, and then assigning the result to bitmap_set.
X    #
X    if \inverse then
X	bitmap_set := (all_bitmaps(bmp_file) -- bitmap_set)
X
X    # We're done.  See if there were any hits.  If so, sort & return a
X    # list (lists are easier for the display routines to handle).
X    #
X    close(bmp_file)
X    #
X    if *bitmap_set > 0
X    then return sort(bitmap_set)
X    else fail
X
Xend
X
X
X
Xprocedure retrieve_bitmaps(offset, f, return_a_list)
X
X    local bitmap_list, bitmap_length, i, tmp, how_many_bitmaps,
X	bits_needed, inverse_signal
X
X    bits_needed := 24
X    inverse_signal := 8388608
X    
X    seek(f, offset)
X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
X    tmp := read_int(f, bits_needed)
X    how_many_bitmaps := iand(inverse_signal-1, tmp)
X
X    # Slower way.
X    # bitmap_list := list(how_many_bitmaps)
X    # every i := 1 to how_many_bitmaps do
X    #     bitmap_list[i] := read_int(f, bitmap_length)
X
X    # Slow way.
X    bitmap_list := list()
X    every i := 1 to how_many_bitmaps do
X	put(bitmap_list, read_int(f, bitmap_length))
X
X    # If the inverse signal bit is turned on, then the BMP file stores
X    # non-occurrences for a given key (rather than occurrences).  Saves
X    # space for a/the/and, etc., but necessitates collecting all bitmaps
X    # for the current file into a set a set difference.  The procedure
X    # all_bitmaps does the collecting.
X    if iand(inverse_signal, tmp) ~= 0 then {
X	bitmap_list := (all_bitmaps(f) -- set(bitmap_list))
X    }
X
X    if \return_a_list
X    then return sort(bitmap_list)
X    else {
X	if type(bitmap_list) == "list"
X	then return set(bitmap_list)
X	else return bitmap_list
X    }
X
Xend
X
X
X
Xprocedure all_bitmaps(f, return_a_list)
X
X    # At offset 1 in the BMP file is the list of all bitmaps in the
X    # full file.  Returns the set of these, unless a list is desired,
X    # in which case one must call all_bitmaps() with a nonnull second
X    # argument.
X    return retrieve_bitmaps(1, f, return_a_list)
X
Xend
SHAR_EOF
true || echo 'restore of retrieve.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= indexutl.icn ==============
if test -f 'indexutl.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping indexutl.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting indexutl.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'indexutl.icn' &&
X############################################################################
X#
X#	Name:	 indexutl.icn
X#
X#	Title:	 indexing utilities
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 2.1
X#
X############################################################################
X#
X#  This file contains base_name(), dir_name(), get_index_fname(),
X#  stripchars(), abort(), and gettokens().
X#
X#  base_name(s), dir_name(s)	- like the Unix system commands
X#  create_fname(fname,ext)	- get a new filename based on fname + ext
X#  stripchars(s,c)		- strip chars c from string s
X#  abort(proc,msg,ecode)	- abort procedure proc with exit code ecode
X#  write_int(f, int, size)	- breaks int into 8-bit chunks & writes to f
X#  read_int(f, int, size)	- like write_int, only constructs int from f
X#  are_metas(pattern)		- succeeds if pattern has egrep-style metas
X#  digits_2_bitmap(s)		- converts string 01:13:94 to an int-bitmap
X#
X############################################################################
X#
X#  Links: ./findre.icn, radcon.icn, bincvt.icn
X#
X#  See also: retrieve.icn, retrops.icn, bmp2text.icn, makeind.icn
X#
X############################################################################
X
X#
X# All from the IPL.
X#
Xlink radcon, bincvt
X
X#
X# FS = field separator, s_len = string length of fields, len =
X# byte length of fields, no = number of fields, is_case_sensitive =
X# whether to map index entries to lowercase, r_field = rollover
X# field for limits file.
X#
Xrecord is(FS, s_len, len, no, is_case_sensitive, r_field, hufftree)
Xglobal _slash, _baselen, IS
X
X
Xprocedure base_name(s)
X
X    # If s == "/usr/local/man/man1/icon.1", base_name will return
X    # "icon.1".  Somewhat like the Unix basename system command.
X
X    # global _slash		# _slash = \ for MS-DOS, / for Unix
X    s ? {
X	while tab(find(_slash)+1)
X	return tab(0)
X    }
X
Xend
X
X
X
Xprocedure dir_name(s)
X
X    # If s == "/usr/local/man/man1/icon.1", dir_name will return
X    # "/usr/local/man/man1".  Somewhat like the Unix dirname system
X    # command.
X
X    local s2
X    # global _slash		# _slash = \ for MS-DOS, / for Unix
X
X    s2 := ""
X    s ? {
X	while s2 ||:= tab(find(_slash)+1)
X	return s2
X    }
X
Xend
X
X
X
Xprocedure create_fname(FNAME, EXT)
X
X    #
X    # Discard path component.  Cut basename down to a small enough
X    # size that the OS will be able to handle addition of the ex-
X    # tension, EXT.
X    #
X
X    # global _slash, _baselen
X
X    *EXT > 3 &
X	abort("get_index_fname","extension too long",7)
X
X    return right(
X	stripchars(base_name(FNAME,_slash),'.'), _baselen, "x") ||
X	    "." || EXT
X
Xend
X
X
X
Xprocedure stripchars(s,c)
X
X    # Strip chars (c) from string (s).  Return stripped s.
X
X    local s2
X
X    s2 := ""
X    s ? {
X	while s2 ||:= tab(upto(c))
X	do tab(many(c))
X	s2 ||:= tab(0)
X    }
X    return s2
X
Xend
X
X
X
Xprocedure abort(proc_name, message, error_code)
X
X    if not (/proc_name := "") then
X	proc_name := trim(proc_name, ': ') || ":  "
X    /error_code := 1
X    
X    write(&errout, proc_name, \message) # fail if there's no error msg,
X    exit(error_code)		 # then abort
X
Xend
X
X
X
Xprocedure write_int(f, i, size)
X
X    # Write out an integer byte-by-byte.
X    #
X    # Important little routine.  I know it looks inelegant and slow.
X    # Feel free to modify it for speed, and send me the results.
X    # Don't knock out the old code, though.  You understood it when
X    # you read it, right?  That's the idea :-).
X
X    local marker, how_many
X
X    marker := ""
X    how_many := 0
X    /size := (*exbase10(i,2) <= seq(0,8))
X
X    # output bytes most significant first; then least significant
X    until (size -:= 8) <= -8 do {
X	how_many +:= 1
X	marker ||:= (f, char(iand(ishift(i, -size), 2r11111111)))
X    }
X
X    writes(f, marker)
X    return how_many		# number of characters written
X
Xend
X
X
X
Xprocedure read_int(f, size)
X
X    local i, _shift
X
X    # collect bytes, putting the first one read into the high
X    # end of an integer, and on down to the last read (into the
X    # low end)
X    i := _shift := 0
X    while (_shift +:= 8) <= size do
X	i +:= ishift(ord(reads(f)), size - _shift) | fail
X    return i
X
Xend
X
X
X
Xprocedure initialize_os_params()
X
X    local os
X    # global _slash, _baselen
X
X    if find("MS-DOS", os := &features) then {
X	_slash := "\\";	_baselen := 8
X    }
X    else if find("UNIX", os := &features) then {
X	_slash := "/"; _baselen := 10
X    }
X    else abort("initialize_os_params","os parameters undefined", 6)
X
X    return os
X
Xend
X
X
Xprocedure are_metas(str)
X
X    local chr, tmp
X
X    str ? {
X
X	# String-initial metacharacters are meaningless.
X	tab(many('*+?|'))
X
X	# Look for metacharacters and backslashes.
X	while tab(upto('\\*+()|?.$^[')) do {
X
X	    # If a backslash comes first, then the next character can't
X	    # be a meta.  Move past it, and try again.
X	    if ="\\" then move(1) |
X		abort("are_metas","malformed \-escape sequence",19)
X	    # Otherwise, we have a metacharacter.  Return its position
X	    # in str.  Dereference just so as not to have a global var.
X	    # on the loose.
X	    else return .&pos
X	}
X
X    }
X
X    # If we've gotten this far without returning, then the string is
X    # clean of metacharacters, and (in boolean terms) the procedure
X    # are_metas() returns false.
X    fail
X
Xend
X
X
X#
X# digits_2_bitmap
X#
X# Converts a string representation of a set of bit-fields into an
X# integer.  I.e. 1:1:3 becomes binary 010111 (decimal 23).  This
X# integer is like a map, and is called, in text-processing circles,
X# a bitmap (not to be confused with bit-mapped display techniques).
X# 
Xprocedure digits_2_bitmap(s)
X
X    # s        = location string (e.g. 10:02:03:75)
X    # IS.s_len = the string length of fields in s (3 in the above example)
X    # IS.len   = the number of bits needed to hold an integer
X    #             representation of a single field
X    # IS.no    = number of fields in s (4 in the above example)
X    #
X    # Fixed field lengths make things much simpler, but a whole
X    # helluva lot less economical.  Be sure that (IS.len * IS.no) does
X    # not exceed the register width for your CPU if either a) your
X    # implementation has no limits on the size of integers, or b) you
X    # are really concerned about performance.  Otherwise, never mind.
X
X    local bitmap, field, no
X
X    no       := IS.no
X    bitmap   := 0
X
X    s ? {
X	if upto(~&digits) then {
X	    # The bitmap is delineated by field-markers (e.g. 11;23).
X	    tab(upto(&digits))
X	    while field := tab(many(&digits)) do {
X		no -:= 1
X		tab(upto(&digits))
X		bitmap +:= ishift(field, no * IS.len)
X	    }
X	} else {
X	    # Yuck!  An un-delineated bitmap (e.g. 23423).
X	    while field := integer(move(IS.s_len)) do {
SHAR_EOF
true || echo 'restore of indexutl.icn failed'
fi
echo 'End of  part 4'
echo 'File indexutl.icn is continued in part 5'
echo 5 > _shar_seq_.tmp
exit 0

exit 0 # Just in case...
-- 
Kent Landfield                   INTERNET: kent@sparky.IMD.Sterling.COM
Sterling Software, IMD           UUCP:     uunet!sparky!kent
Phone:    (402) 291-8300         FAX:      (402) 291-4362
Please send comp.sources.misc-related mail to kent@uunet.uu.net.
