From pa.dec.com!decwrl!uunet!sparky!kent Fri Jul 19 14:07:53 PDT 1991
Article: 2492 of comp.sources.misc
Newsgroups: comp.sources.misc
Path: pa.dec.com!decwrl!uunet!sparky!kent
From: Richard Goerwitz <goer@midway.uchicago.edu>
Subject:  v20i101:  bibleref - Word and passage retrievals from King James Bible, Part03/07
Message-ID: <1991Jul18.030146.29356@sparky.IMD.Sterling.COM>
X-Md4-Signature: 1eba3d87cbcd3afa6c2d3074f1084c9c
Sender: kent@sparky.IMD.Sterling.COM (Kent Landfield)
Organization: Sterling Software, IMD
References: <csm-v20i099=bibleref.215242@sparky.imd.sterling.com>
Date: Thu, 18 Jul 1991 03:01:46 GMT
Approved: kent@sparky.imd.sterling.com
Lines: 1040

Submitted-by: Richard Goerwitz <goer@midway.uchicago.edu>
Posting-number: Volume 20, Issue 101
Archive-name: bibleref/part03
Environment: ICON, UNIX

---- Cut Here and feed the following to sh ----
#!/bin/sh
# this is bibleref.03 (part 3 of a multipart archive)
# do not concatenate these parts, unpack them in order with /bin/sh
# file rewrap.icn continued
#
if test ! -r _shar_seq_.tmp; then
	echo 'Please unpack part 1 first!'
	exit 1
fi
(read Scheck
 if test "$Scheck" != 3; then
	echo Please unpack part "$Scheck" next!
	exit 1
 else
	exit 0
 fi
) < _shar_seq_.tmp || exit 1
if test ! -f _shar_wnt_.tmp; then
	echo 'x - still skipping rewrap.icn'
else
echo 'x - continuing file rewrap.icn'
sed 's/^X//' << 'SHAR_EOF' >> 'rewrap.icn' &&
X		return line || tab(0)
X	    }
X	}
X
X	# Keep the extra section of s in a buffer.
X	old_line := tab(0)
X
X	# If the reason the remaining section of s was unrewrapable was
X	# that it was too long, and couldn't be broken up, then just return
X	# the thing as-is.
X	if *old_line > i then {
X	    old_line ? {
X		if extra_bit := tab(upto(' -')+1) || (tab(many(' ')) | "")
X		then old_line := tab(0)
X		else extra_bit := old_line & old_line := ""
X		return trim(extra_bit)
X	    }
X	}
X	# Otherwise, clean up the buffer for prepending to the next s.
X	else {
X	    # If old_line is blank, then don't mess with it.  Otherwise,
X	    # add whatever is needed in order to link it with the next s.
X	    if old_line ~== "" then {
X		# If old_line ends in a dash, then there's no need to add a
X		# space to it.
X		if old_line[-1] ~== "-"
X		then old_line ||:= " "
X	    }
X	}
X    }
X    
Xend
X
X
X
Xprocedure EndToFront(i)
X    # Goes with rewrap(s,i)
X    *&subject+1 - &pos >= i | fail
X    suspend &subject[.&pos:&pos <- &pos+i to &pos by -1]
Xend
SHAR_EOF
echo 'File rewrap.icn is complete' &&
true || echo 'restore of rewrap.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= binsrch.icn ==============
if test -f 'binsrch.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping binsrch.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting binsrch.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'binsrch.icn' &&
X############################################################################
X#
X#	Name:	 binsrch.icn
X#
X#	Title:	 general-purpose binary index search
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.4
X#
X############################################################################
X#
X#  This file contains a single procedure, binary_index_search(str,
X#  filename), which goes through a file called filename looking for a
X#  line beginning with str.  Note well that binary_index_search()
X#  assumes lines in filename will contain more than str.  Str must
X#  occupy the first part of the line, separated from the remainder by
X#  a tab.
X#
X############################################################################
X#
X#  Links: none
X#
X#  See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X
Xprocedure binary_index_search(entry, index_filename)
X
X    local in_index, bottom, top, loc, incr, firstpart, offset
X
X    in_index := open(index_filename) |
X	abort("binary_index_search","can't open "||index_filename,18)
X
X    bottom := 1
X    seek(in_index, 0)
X    top := where(in_index)
X
X    # If bottom gets bigger than top, there's no such entry.
X    until bottom > top do {
X
X	loc := (top+bottom) / 2
X	seek(in_index, loc)
X
X	# Move past next newline.  If at bottom, break.
X	incr := 1
X	until reads(in_index) == "\n" do
X	    incr +:= 1
X	if loc+incr = bottom then {
X	    top := loc-1
X	    next
X	}
X
X	# Check to see if the current line starts with entry (arg 1).
X	read(in_index) ? {
X
X	    # .IND file line format is entry\tbitmap-file-offset
X	    if entry == (firstpart := tab(find("\t"))) then {
X		# return offset
X		return (move(1), tab(0))
X	    }
X	    # Ah, this is what all binary searches do.
X	    else {
X		if entry << firstpart
X		then top := loc-1
X		else bottom := loc + incr + *&subject
X	    }
X	}
X    }
X
Xend
SHAR_EOF
true || echo 'restore of binsrch.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= bmp2text.icn ==============
if test -f 'bmp2text.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping bmp2text.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting bmp2text.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'bmp2text.icn' &&
X############################################################################
X#
X#	Name:	 bmp2text.icn
X#
X#	Title:	 convert a bitmap to a text-chunk
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.14
X#
X############################################################################
X#
X#  This file contains bitmap_2_text(bitmap, filename).  Recall that
X#  bitmaps are just a series of fixed-length bitfields used to mark
X#  divisions within a text.  The procedure retrieve() finds words in
X#  an index file, and returns a list of these bitmaps, which point to
X#  divisions within the original text file - divisions within which a
X#  given indexed word found by retrieve() occurs.  The procedure
X#  bitmap_2_filename() simply takes a given bitmap and finds the text
X#  with which it is associated in the full text file.
X#
X#  Note that bitmap_2_text() does not seek directly to the correct
X#  location within "filename" (arg 2).  It first breaks down the
X#  bitmap into a less precise form, looks up the location of that
X#  form, seeks up to its location, and then bumbles along until it
X#  reaches the chunk of text corresponding to the full "bitmap" (arg
X#  1).  The reason bitmap_2_text() does this is that makeind (the
X#  indexing routine which creates data files for retrieve() and
X#  bitmap_2_text()) does not store the offset within filename for
X#  every bitmap.  It just saves the locations of major blocks.  This
X#  is basically just a space-saving device.  It would eat up too much
X#  memory (both disk and core) to keep a list of every offset for
X#  every chunk of text marked out by a bitmap in filename.
X#
X#  Note also that, although retrieve() returns a list of bitmaps, bit-
X#  map_2_text(bitmap, filename) expects a single bitmap as its first
X#  argument.  It is better that text be retrieved as needed, one chunk
X#  at a time, and not stuffed en masse into core memory as soon as it
X#  is retrieve()'d.
X#
X############################################################################
X#
X#  Links: ./indexutl.icn, ./initfile.icn
X#
X#  See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X# Declared in indexutl.icn.
X# record is(FS, s_len, len, no, is_case_sensitive)
X# global IS
X
X# Declared in initfile.icn.
X# global filestats
X# record Fs(ind_filename, bmp_filename, lim_filename, IS, ofs_table)
X
Xprocedure bitmap_2_text(bitmap, filename)
X
X    local intext, cut_down_bitmap, upto_field, offset, line, value,
X	base_value_mask, base_value, location
X    static t
X    # global filestats, IS
X    initial t := table()
X
X    # Check for sloppy programming.
X    /filename & abort("bitmap_2_text","you called me without a filename",29)
X
X    # If necessary, initialize stats for the current file.
X    #
X    if /filestats | /filestats[filename]
X    then initfile(filename)           # see initfile.icn
X    # Reset IS to current file.
X    IS := filestats[filename].IS
X
X    # open full text file for reading
X    intext := open(filename) |
X	abort("bitmap_2_text", "can't open "||filename, 26)
X
X    # Determine offset to seek to by using the bitmap->offset table
X    # for the current file (arg 2).  The name of the bitmap_offset
X    # table is stored in filestats[filename].ofs_table.
X    #
X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
X    cut_down_bitmap := ishift(bitmap, -(IS.no - upto_field) * IS.len)
X    offset := \filestats[filename].ofs_table[cut_down_bitmap] | fail
X
X    # Seek to offset, and begin looking for the string equiv. of
X    # bitmap (arg 1).
X    #
X    seek(intext, offset) |
X	abort("bitmap_2_text","can't seek to offset "||offset, 27)
X
X    #
X    # This works a lot like the routine in gettext.icn (another related
X    # retrieval package).  Note that bitmaps in "filename" (arg 2) are on
X    # their own lines, preceded by a double colon.
X    #
X    # First figure out how to tell if we've gone too far.  Basically,
X    # mask out the lower bits, and record the value of the upper bits.
X    # Some fooling around is necessary because bitmaps may use large
X    # ints, making it impossible to use icom() in a naive manner.
X    # If the upper bits of the bitmaps being read change, then we've
X    # gone too far.
X    #
X    base_value_mask := icom(2^((IS.no - upto_field) * IS.len)- 1)
X    base_value := iand(bitmap, base_value_mask)
X
X    while line := read(intext) do {
X	line ? {
X	    if ="::" then {
X		location := digits_2_bitmap(tab(0)) # in indexutl.icn
X		if bitmap = location
X		then {
X		# Collect all text upto the next colon+colon-initial
X		# line (::) or EOF.
X		    value := ""
X		    while line := read(intext) do {
X			match("::",line) & break
X			value ||:= line || "\n"
X		    }
X		    # Note that a key with an empty value returns an
X		    # empty string.
X		    close(intext)
X		    return trim(value, '\n')
X		}
X		else {
X		    if base_value ~= iand(location, base_value_mask)
X		    then fail
X		}
X	    }
X	}
X    }
X
X    # we should have returned by now
X    close(intext)
X    fail
X
Xend
X
SHAR_EOF
true || echo 'restore of bmp2text.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= initfile.icn ==============
if test -f 'initfile.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping initfile.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting initfile.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'initfile.icn' &&
X############################################################################
X#
X#	Name:	 initfile.icn
X#
X#	Title:	 initialize entry for file in filestats table
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.13
X#
X############################################################################
X#
X#  This file contains initfile(filename), which creates a set of stats
X#  for the indexed database contained in filename.  Uses several global
X#  structures, primarily for speed.  Beware.
X#
X############################################################################
X#
X#  See also: retrieve.icn, bmp2text.icn, retrops.icn
X#
X############################################################################
X
X# Used to store stats for each filename.
Xrecord Fs(ind_filename, bmp_filename, lim_filename, IS, ofs_table)
X
X# IS is declared in indexutl.icn.
X# global IS
X
Xglobal filestats
X
Xprocedure initfile(filename)
X
X    # Messy procedure which creates and stores the names of several
X    # files that will be repeatedly used with "filename."  Reads in
X    # the stats for filename from that file's .IS file.  Also reads in
X    # the bitmap->offset (.OFS file) table, and puts it into
X    # filestats[filename].ofs_table for later (re-)use.
X
X    local IS_filename, in_IS, upto_field, stored_bitmap_length,
X	ofs_filename, intext, cut_down_bitmap, block_size, offset
X    # global filestats
X    initial {
X	filestats := table()
X	# OS-specific parameters are initialized here.
X	initialize_os_params()	# in indexutl.icn
X    }
X
X    # Check for sloppy programming.  Did we do this one already??
X    if not (/filestats[filename] := Fs(,,,,table())) then fail
X
X    filestats[filename].ind_filename :=
X	dir_name(filename)||create_fname(filename, "IND")
X    filestats[filename].bmp_filename :=
X	dir_name(filename)||create_fname(filename, "BMP")
X    filestats[filename].lim_filename :=
X	dir_name(filename)||create_fname(filename, "LIM")
X
X    # Decode stored IS record for filename.
X    IS_filename := dir_name(filename)||create_fname(filename, "IS")
X    in_IS := open(IS_filename) | abort("bitmap_2_text",
X	"Can't open "||IS_filename||".  Did you forget to index?", 24)
X    filestats[filename].IS := decode(!in_IS)
X    close(in_IS)
X	
X    # Having decoded IS, we can now determine the length of the cut-
X    # down bitmaps stored in the .OFS file for filename.
X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
X    stored_bitmap_length :=
X	((filestats[filename].IS.len * upto_field) <= seq(0,8))
X
X    # open .OFS file
X    ofs_filename := dir_name(filename)||create_fname(filename, "OFS")
X    intext := open(ofs_filename) |
X	abort("bitmap_2_text", "can't open "||ofs_filename, 23)
X    
X    # read in blocks from .OFS file, breaking them into their
X    # constituent parts
X    while block_size := read_int(intext, 8) * 8 do {
X	cut_down_bitmap := read_int(intext, stored_bitmap_length)
X	offset := read_int(intext, block_size - stored_bitmap_length)
X	insert(filestats[filename].ofs_table, cut_down_bitmap, offset)
X    }
X    close(intext)
X
X    return *filestats[filename].ofs_table
X
Xend
SHAR_EOF
true || echo 'restore of initfile.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= retrieve.icn ==============
if test -f 'retrieve.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping retrieve.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting retrieve.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'retrieve.icn' &&
X############################################################################
X#
X#	Name:	 retrieve.icn
X#
X#	Title:	 retrieve locations of words in database file
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.18
X#
X############################################################################
X#
X#  Retrieve(pattern, filename) retrieves all locations containing
X#  words matching pattern (arg1) in filename (arg2), placing them in a
X#  list.  "Locations" are integer-coded pointers to places in filename
X#  where corresponding text is located.  To actually retrieve that
X#  block of text, you must call bitmap_2_text(location, filename).
X#  Retrieve() only gathers up a list of locations in filename
X#  containing words which match pattern.
X#
X#  The reason retrieve() doesn't do the logical thing - namely, to
X#  "retrieve" text itself - is that doing so might use a *lot* of
X#  memory.  It is far more economical to retrieve text only when a
X#  given chunk is requested via bitmap_2_text().
X#
X#  The format for filename must conform to a simple, but strict, set
X#  of guidelines.  Basically, it must interleave a series of keys
X#  (so-called "bitmaps") with actual text:
X#
X#  ::001:001:001
X#  This is text.
X#  ::001:001:002
X#  This is more text.
X#
X#  The lines beginning with :: (a double colon) are the keys.  These
X#  translate into an integer dividable internally into (in this case)
X#  three bit-fields of length 10 (enough to handle 999:999:999), which
X#  serve as a location markers for the text that goes with them.  See
X#  makeind.icn for a precise instructions on how to construct and index
X#  files.
X#
X#  Note:  Patterns must match words in their entirety.  For instance,
X#  retrieve("dog",filename) would only retrieve exact matches for the
X#  word "dog" in filename.  To catch, say, "doggie" as well, it would
X#  be necessary to call retrieve with a regular expression that
X#  matched both dog and doggie (e.g. retrieve("dog.*",filename)).
X#
X############################################################################
X#
X#  Links: codeobj.icn, ./indexutl.icn, ./binsrch.icn, ./initfile.icn
X#         ./findre.icn
X#
X#  See also: makeind.icn, bmp2text.icn
X#
X############################################################################
X
Xlink codeobj
X
X# The following globals contain stats for current file (here, arg2).
X# global filestats    # declared in initfile.icn
X# global IS           # declared in indexutl.icn
X
Xprocedure retrieve(pattern, filename, inverse)
X
X    local bitmap_set, bmp_file, in_egrep, intext, cmd, offset, line
X    static is_UNIX, egrep_filename
X    initial {
X	if is_UNIX := find("UNIX",&features) then
X	    # If egrep is available, use it.  It's fast.
X	    egrep_filename := "egrep"
X	    # egrep_filename := "/usr/local/bin/gnuegrep"
X    }
X
X    # Check for sloppy programming.
X    /filename & abort("retrieve","you called me without a filename",22)
X
X    # Initialize important variables.
X    #
X    if /filestats | /filestats[filename]
X    then initfile(filename)           # see initfile.icn
X    bitmap_set := set()		      # list will contain locations of hits
X    IS := filestats[filename].IS      # re-initialize IS for current file
X    if /IS.is_case_sensitive then
X	pattern := map(pattern)
X
X    # Open bitmap file.
X    #
X    bmp_file := open(filestats[filename].bmp_filename) |
X	abort("retrieve","can't open "||filestats[filename].bmp_filename, 29)
X
X    # Search index.
X    #
X    if are_metas(pattern) then {
X    # NB: are_metas() can be found in indexutl.icn
X
X	# If there are metacharacters in pattern, do a regexp pattern match.
X	# The .IND file goes:  line ::= key \t other-stuff.
X	pattern := "^(" || pattern || ")\t"
X
X	# If UNIX, then use egrep to search index.
X	#
X	if \is_UNIX then {
X
X	    # Set up command line to be passed to /bin/sh.
X	    cmd := egrep_filename || " '" || pattern ||
X		"' " || filestats[filename].ind_filename
X	    # open pipe
X	    in_egrep := open(cmd, "rp") |
X		abort("retrieve","can't open pipe from\n\t"||cmd, 20)
X	    # grep .IND index file
X	    every line := !in_egrep do {
X		line ? (tab(find("\t")+1), offset := integer(tab(0)))
X		bitmap_set ++:=
X		    retrieve_bitmaps(offset, bmp_file)
X	    }
X	    close(in_egrep)
X
X	# ...otherwise (i.e. if not UNIX) use findre() instead of egrep
X	#
X	} else {
X
X	    # Probably MS-DOS or something else.  SLOW, SLOW!
X	    intext := open(filestats[filename].ind_filename) |
X		abort("retrieve","can't open index file", 21)
X	    # grep .IND file
X	    every line := !intext do {
X		line ? {
X		    if findre(pattern) & tab(find("\t")+1) then {
X			offset := integer(tab(0))
X			bitmap_set ++:=
X			    retrieve_bitmaps(offset, bmp_file)
X		    }
X		}
X	    }
X	    every close(bmp_file | intext)
X	}
X
X    # If *not* are_metas(pattern), then do a binary search of the index.
X    # No need to worry about is_UNIX, egrep, findre(), etc.
X    #
X    } else {
X	if offset :=
X	    binary_index_search(pattern, filestats[filename].ind_filename)
X	then bitmap_set ++:=
X	    retrieve_bitmaps(offset, bmp_file)
X    }
X
X    # If inverse (arg 3) is nonnull, then invert the sense of the search.
X    # Do this by knocking out those parts of the full bitmap set that are
X    # in the bitmap_set, and then assigning the result to bitmap_set.
X    #
X    if \inverse then
X	bitmap_set := (all_bitmaps(bmp_file) -- bitmap_set)
X
X    # We're done.  See if there were any hits.  If so, sort & return a
X    # list (lists are easier for the display routines to handle).
X    #
X    close(bmp_file)
X    #
X    if *bitmap_set > 0
X    then return sort(bitmap_set)
X    else fail
X
Xend
X
X
X
Xprocedure retrieve_bitmaps(offset, f, return_a_list)
X
X    local bitmap_list, bitmap_length, i, tmp, how_many_bitmaps,
X	bits_needed, inverse_signal
X
X    bits_needed := 24
X    inverse_signal := 8388608
X    
X    seek(f, offset)
X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
X    tmp := read_int(f, bits_needed)
X    how_many_bitmaps := iand(inverse_signal-1, tmp)
X
X    # Slower way.
X    # bitmap_list := list(how_many_bitmaps)
X    # every i := 1 to how_many_bitmaps do
X    #     bitmap_list[i] := read_int(f, bitmap_length)
X
X    # Slow way.
X    bitmap_list := list()
X    every i := 1 to how_many_bitmaps do
X	put(bitmap_list, read_int(f, bitmap_length))
X
X    # If the inverse signal bit is turned on, then the BMP file stores
X    # non-occurrences for a given key (rather than occurrences).  Saves
X    # space for a/the/and, etc., but necessitates collecting all bitmaps
X    # for the current file into a set a set difference.  The procedure
X    # all_bitmaps does the collecting.
X    if iand(inverse_signal, tmp) ~= 0 then {
X	bitmap_list := (all_bitmaps(f) -- set(bitmap_list))
X    }
X
X    if \return_a_list
X    then return sort(bitmap_list)
X    else {
X	if type(bitmap_list) == "list"
X	then return set(bitmap_list)
X	else return bitmap_list
X    }
X
Xend
X
X
X
Xprocedure all_bitmaps(f, return_a_list)
X
X    # At offset 1 in the BMP file is the list of all bitmaps in the
X    # full file.  Returns the set of these, unless a list is desired,
X    # in which case one must call all_bitmaps() with a nonnull second
X    # argument.
X    return retrieve_bitmaps(1, f, return_a_list)
X
Xend
SHAR_EOF
true || echo 'restore of retrieve.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= indexutl.icn ==============
if test -f 'indexutl.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping indexutl.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting indexutl.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'indexutl.icn' &&
X############################################################################
X#
X#	Name:	 indexutl.icn
X#
X#	Title:	 indexing utilities
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.19
X#
X############################################################################
X#
X#  This file contains base_name(), dir_name(), get_index_fname(),
X#  stripchars(), abort(), and gettokens().
X#
X#  base_name(s), dir_name(s)	- like the Unix system commands
X#  create_fname(fname,ext)	- get a new filename based on fname + ext
X#  stripchars(s,c)		- strip chars c from string s
X#  abort(proc,msg,ecode)	- abort procedure proc with exit code ecode
X#  write_int(f, int, size)	- breaks int into 8-bit chunks & writes to f
X#  read_int(f, int, size)	- like write_int, only constructs int from f
X#  are_metas(pattern)		- succeeds if pattern has egrep-style metas
X#  digits_2_bitmap(s)		- converts string 01:13:94 to an int-bitmap
X#
X############################################################################
X#
X#  Links: ./findre.icn, radcon.icn, bincvt.icn
X#
X#  See also: retrieve.icn, retrops.icn, bmp2text.icn, makeind.icn
X#
X############################################################################
X
X#
X# All from the IPL.
X#
Xlink radcon, bincvt
X
X#
X# FS = field separator, s_len = string length of fields, len =
X# byte length of fields, no = number of fields, is_case_sensitive =
X# whether to map index entries to lowercase, r_field = rollover
X# field for limits file.
X#
Xrecord is(FS, s_len, len, no, is_case_sensitive, r_field)
Xglobal _slash, _baselen, IS
X
X
Xprocedure base_name(s)
X
X    # If s == "/usr/local/man/man1/icon.1", base_name will return
X    # "icon.1".  Somewhat like the Unix basename system command.
X
X    # global _slash		# _slash = \ for MS-DOS, / for Unix
X    s ? {
X	while tab(find(_slash)+1)
X	return tab(0)
X    }
X
Xend
X
X
X
Xprocedure dir_name(s)
X
X    # If s == "/usr/local/man/man1/icon.1", dir_name will return
X    # "/usr/local/man/man1".  Somewhat like the Unix dirname system
X    # command.
X
X    local s2
X    # global _slash		# _slash = \ for MS-DOS, / for Unix
X
X    s2 := ""
X    s ? {
X	while s2 ||:= tab(find(_slash)+1)
X	return s2
X    }
X
Xend
X
X
X
Xprocedure create_fname(FNAME, EXT)
X
X    #
X    # Discard path component.  Cut basename down to a small enough
X    # size that the OS will be able to handle addition of the ex-
X    # tension, EXT.
X    #
X
X    # global _slash, _baselen
X
X    *EXT > 3 &
X	abort("get_index_fname","extension too long",7)
X
X    return right(
X	stripchars(base_name(FNAME,_slash),'.'), _baselen, "x") ||
X	    "." || EXT
X
Xend
X
X
X
Xprocedure stripchars(s,c)
X
X    # Strip chars (c) from string (s).  Return stripped s.
X
X    local s2
X
X    s2 := ""
X    s ? {
X	while s2 ||:= tab(upto(c))
X	do tab(many(c))
X	s2 ||:= tab(0)
X    }
X    return s2
X
Xend
X
X
X
Xprocedure abort(proc_name, message, error_code)
X
X    if not (/proc_name := "") then
X	proc_name := trim(proc_name, ': ') || ":  "
X    /error_code := 1
X    
X    write(&errout, proc_name, \message) # fail if there's no error msg,
X    exit(error_code)		 # then abort
X
Xend
X
X
X
Xprocedure write_int(f, i, size)
X
X    # Write out an integer byte-by-byte.
X    #
X    # Important little routine.  I know it looks inelegant and slow.
X    # Feel free to modify it for speed, and send me the results.
X    # Don't knock out the old code, though.  You understood it when
X    # you read it, right?  That's the idea :-).
X
X    local marker, how_many
X
X    marker := ""
X    how_many := 0
X    /size := (*exbase10(i,2) <= seq(0,8))
X
X    # output bytes most significant first; then least significant
X    until (size -:= 8) <= -8 do {
X	how_many +:= 1
X	marker ||:= (f, char(iand(ishift(i, -size), 2r11111111)))
X    }
X
X    writes(f, marker)
X    return how_many		# number of characters written
X
Xend
X
X
X
Xprocedure read_int(f, size)
X
X    local i, _shift
X
X    # collect bytes, putting the first one read into the high
X    # end of an integer, and on down to the last read (into the
X    # low end)
X    i := _shift := 0
X    while (_shift +:= 8) <= size do
X	i +:= ishift(ord(reads(f)), size - _shift) | fail
X    return i
X
Xend
X
X
X
Xprocedure initialize_os_params()
X
X    local os
X    # global _slash, _baselen
X
X    if find("MS-DOS", os := &features) then {
X	_slash := "\\";	_baselen := 8
X    }
X    else if find("UNIX", os := &features) then {
X	_slash := "/"; _baselen := 10
X    }
X    else abort("initialize_os_params","os parameters undefined", 6)
X
X    return os
X
Xend
X
X
Xprocedure are_metas(str)
X
X    local chr, tmp
X
X    str ? {
X
X	# String-initial metacharacters are meaningless.
X	tab(many('*+?|'))
X
X	# Look for metacharacters and backslashes.
X	while tab(upto('\\*+()|?.$^[')) do {
X
X	    # If a backslash comes first, then the next character can't
X	    # be a meta.  Move past it, and try again.
X	    if ="\\" then move(1) |
X		abort("are_metas","malformed \-escape sequence",19)
X	    # Otherwise, we have a metacharacter.  Return its position
X	    # in str.  Dereference just so as not to have a global var.
X	    # on the loose.
X	    else return .&pos
X	}
X
X    }
X
X    # If we've gotten this far without returning, then the string is
X    # clean of metacharacters, and (in boolean terms) the procedure
X    # are_metas() returns false.
X    fail
X
Xend
X
X
X#
X# digits_2_bitmap
X#
X# Converts a string representation of a set of bit-fields into an
X# integer.  I.e. 1:1:3 becomes binary 010111 (decimal 23).  This
X# integer is like a map, and is called, in text-processing circles,
X# a bitmap (not to be confused with bit-mapped display techniques).
X# 
Xprocedure digits_2_bitmap(s)
X
X    # s        = location string (e.g. 10:02:03:75)
X    # IS.s_len = the string length of fields in s (3 in the above example)
X    # IS.len   = the number of bits needed to hold an integer
X    #             representation of a single field
X    # IS.no    = number of fields in s (4 in the above example)
X    #
X    # Fixed field lengths make things much simpler, but a whole
X    # helluva lot less economical.  Be sure that (IS.len * IS.no) does
X    # not exceed the register width for your CPU if either a) your
X    # implementation has no limits on the size of integers, or b) you
X    # are really concerned about performance.  Otherwise, never mind.
X
X    local bitmap, field, no
X
X    no       := IS.no
X    bitmap   := 0
X
X    s ? {
X	if upto(~&digits) then {
X	    # The bitmap is delineated by field-markers (e.g. 11;23).
X	    tab(upto(&digits))
X	    while field := tab(many(&digits)) do {
X		no -:= 1
X		tab(upto(&digits))
X		bitmap +:= ishift(field, no * IS.len)
X	    }
X	} else {
X	    # Yuck!  An un-delineated bitmap (e.g. 23423).
X	    while field := integer(move(IS.s_len)) do {
X		no -:= 1
X		tab(upto(&digits))
X		bitmap +:= ishift(field, no * IS.len)
X	    }
X	}
X	# If we're not at the end of the line, then we've got a
X	# a problem with the portion of the input file passed
X	# to digits_2_bitmap as s (arg1).
X	pos(0) | abort("digits_2_bitmap",
X		       "malformed position marker:  "||s,
X		       11)
X    }
X
X    # If the current no is not -1, then we have either too
X    # many or too few fields, i.e. someone wrote, say, 01:02:03 in
X    # a text which he or she declared as having four fields.
X    no = 0 | abort("digits_2_bitmap",
X	no || " fields in "||s||" (expected "||IS.no||")",
X	12)
X    # write(&errout,"bitmap = ",radcon(bitmap,10,2))  # for debugging
X    return bitmap
X
Xend
SHAR_EOF
true || echo 'restore of indexutl.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= retrops.icn ==============
if test -f 'retrops.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping retrops.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting retrops.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'retrops.icn' &&
X############################################################################
X#
X#	Name:	 retrops.icn
X#
X#	Title:	 logical operations for retrievals
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.16
X#
X############################################################################
X#
X#  The following collection of procedures implements logical
X#  and/or/and_not operations for the retrieve text-retrieval package.
X#  Their general form is
X#
X#      r_op(set1, set2, filename, field, range)
X#
X#  where op = one of either and, or, or and_not.  The field and range
X#  arguments are optional.
X#
X#  To illustrate how these operations are performed, let me explain
X#  how one of the procedures below, r_and(), works.  Let us assume we
X#  have retrieve()d bitmap sets for two patterns in a single indexed
X#  file.  Call the sets set1 and set2.  Call the file filename.  These
X#  two sets are passed to r_and() as arguments one and two.  R_and()
X#  takes the intersection of these two sets.  The result is a
X#  collection of all bitmaps pointing to blocks in filename containing
X#  words matching *both* of the two patterns used to generate set1 and
X#  set2.  R_and() returns this result to the calling procedure.
X#
X#  Note that, by default, r_and() retrieves co-ocurrences of patterns
X#  within a single block.  If the programmer wishes to find
X#  co-ocurrences within larger units, he or she may supply a field
X#  argument.  Fields are fixed width bit-fields into which location
X#  markers for filename are divided, numbered from the largest and
X#  most general to the smallest and most specific.  See the file
X#  makeind for a discussion of how they are handled.  A range
X#  parameter may also be specified, which makes it possible to look
X#  for coocurrences in collections of more than one unit of the type
X#  specified in the field argument.
X#
X############################################################################
X#
X#  Links: none
X#
X#  See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X# The following globals contain stats for current file (here, arg 3).
X# global filestats    # declared in initfile.icn
X# global IS           # declared in indexutl.icn
X
Xprocedure r_or(set1, set2, filename, field, range)
X
X    # Check for sloppy programming.
X    /filename & abort("apply_op", "you gotta call me with a filename", 43)
X    type(set1) == ("list"|"set") |
X	abort("apply_op","arg 1 must be a list/set",45)
X    type(set2) == ("list"|"set") |
X	abort("apply_op","arg 2 must be a list/set",46)
X
X    # Be sure to convert lists to sets.  Personally, I think list -> set
X    # conversions should be as automatic in Icon as their string -> cset
X    # correspondents.
X    type(set1) == "set" | (set1 := set(set1))
X    type(set2) == "set" | (set2 := set(set2))
X
X    # No need to initialize variables.  Field and range are
X    # meaningless for this op.
X    return set1 ++ set2
X
Xend
X
Xprocedure r_and(set1, set2, filename, field, range)
X    # set intersection
X    return apply_op("**", set1, set2, filename, field, range)
SHAR_EOF
true || echo 'restore of retrops.icn failed'
fi
echo 'End of  part 3'
echo 'File retrops.icn is continued in part 4'
echo 4 > _shar_seq_.tmp
exit 0

exit 0 # Just in case...
-- 
Kent Landfield                   INTERNET: kent@sparky.IMD.Sterling.COM
Sterling Software, IMD           UUCP:     uunet!sparky!kent
Phone:    (402) 291-8300         FAX:      (402) 291-4362
Please send comp.sources.misc-related mail to kent@uunet.uu.net.


