Newsgroups: alt.sources
From: goer@ellis.uchicago.edu (Richard L. Goerwitz)
Subject: kjv browser, part 9 of 11
Message-ID: <1991Jul3.065253.28409@midway.uchicago.edu>
Date: Wed, 3 Jul 1991 06:52:53 GMT

---- Cut Here and feed the following to sh ----
#!/bin/sh
# this is bibleref.09 (part 9 of a multipart archive)
# do not concatenate these parts, unpack them in order with /bin/sh
# file makeind.icn continued
#
# Part 1 creates _shar_seq_.tmp; without it nothing can be continued.
test -r _shar_seq_.tmp || {
	echo 'Please unpack part 1 first!'
	exit 1
}
# The sequence file holds the number of the part expected next.  The
# check runs in a subshell fed from that file; a non-zero status from
# the subshell stops the whole script.
(
	read Scheck
	test "$Scheck" = 9 && exit 0
	echo Please unpack part "$Scheck" next!
	exit 1
) < _shar_seq_.tmp || exit 1
# _shar_wnt_.tmp is the "currently writing a file" marker: this script
# creates it when it starts extracting a file and removes it when a
# file is skipped or finished.  If it is absent here, makeind.icn is
# presumably being skipped by an earlier part, so keep skipping it.
if test ! -f _shar_wnt_.tmp; then
	echo 'x - still skipping makeind.icn'
else
# Append (>>) the next slice of makeind.icn, stripping the leading "X"
# that protects each line of the here-document.
echo 'x - continuing file makeind.icn'
sed 's/^X//' << 'SHAR_EOF' >> 'makeind.icn' &&
X    # (keys are option letters).
X    #
X    usage:= "usage: makeind -f filename -m int -n int [-l int] [-s]"
X    opt_table := initialize_IS(a)
X    fname := \opt_table["f"]				| stop(usage)
X    rollover_field := opt_table["l"]			# (optional)
X
X    #
X    # Begin the process of tokenizing, recording token locations, and
X    # of storing this information in two separate files.
X    #
X    # Read input file, making a table of words and their locations.
X    index_table  := create_index(fname)
X
X    #
X    # Write keys to one file, with pointers into another file
X    # containing the bitmaps for each key.
X    #
X    index_fname  := dir_name(fname)||create_fname(fname, "IND")
X    bitmap_fname := dir_name(fname)||create_fname(fname, "BMP")
X    write_tokens_and_offsets(index_fname, bitmap_fname, index_table)
X
X    #
X    # Re-open fname and store the locations for each chunk of text
X    # marked by a ::location marker.  This could certainly be
X    # incorporated into the indexing routines, but only at the great
X    # expense of clarity.
X    #
X    upto_field := 1 < (IS.no * 2) / 3 | 1
X    bofname := dir_name(fname)||create_fname(fname, "OFS")
X    bitmap_offset_table := 
X	store_bitmaps_and_offsets(fname, upto_field)
X    # store in .OFS file
X    write_bitmaps_and_offsets(bofname, bitmap_offset_table, upto_field)
X
X    #
X    # Re-open fname again, and store the pre-rollover bitmaps in the
X    # .LIM file.  Obviously this procedure could be stuffed into
X    # another one above (e.g. store_bitmaps_and_offsets()).
X    #
X    if \rollover_field then {
X	#
X	# Let's say we are using the Bible as our text, and we want to
X	# create all the bitmaps for Genesis 1:9-2:10.  We need to know
X	# what verse chapter 1 goes up to.  By supplying makeind
X	# with a "-l 3" argument, you are telling it to store this in-
X	# formation for later use by expandrf().
X	#
X	limits_fname := dir_name(fname)||create_fname(fname, "LIM")
X	write_limits(limits_fname, fname, rollover_field)
X	IS.r_field := rollover_field
X    }
X
X    #
X    # Write IS record to the .IS file.
X    #
X    out_IS := open(dir_name(fname)||create_fname(fname, "IS"), "w") |
X	abort("makeind","can't open .IS file",2)
X    writes(out_IS, encode(IS))
X    close(out_IS)
X
X    # All is well.  Exit with zero status.
X    exit(0)
X
Xend
X
X
X#
X# initialize_IS
X#
X# Creates the global IS record and fills in the parameters that come
X# straight from the command line:
X#
X#     IS.no                 number of fields in a location marker (-n)
X#     IS.FS                 field separator regular expression (-S), or
X#                           a built-in default when -S is absent
X#     IS.is_case_sensitive  value of the -s option (normally &null)
X#     IS.s_len              length of the -m value written as a string
X#     IS.len                length of the -m value written in base two
X#
X# No other IS fields are assigned here.
X#
X# Argument a is the list of command-line arguments.  Returns the table
X# produced by options(), so the caller can pick out -f and -l itself.
X# Stops with the usage message if the number of options is out of
X# range, or if -n or -m is missing, or if -m is not an integer.
X#
Xprocedure initialize_IS(a)
X
X    local usage, opt_table, max_field
X    # global IS
X
X    usage:="usage: makeind -f filename -m int -n int [-l int] [-s]"
X
X    IS := is()			# set up some IS fields
X    opt_table := options(a, "f:m:n+sS:l+")
X    3 <= *opt_table <= 6			| stop(usage)
X    IS.no := \opt_table["n"]			| stop(usage)
X    # The count test above is satisfied by e.g. -f, -n and -l alone, so
X    # -m has to be checked explicitly; it arrives as a string ("m:").
X    max_field := \opt_table["m"]		| stop(usage)
X    integer(max_field)				| stop(usage)
X    IS.FS := \opt_table["S"] | "['.]?[^-0-9A-Za-z']+'?"
X    IS.is_case_sensitive := opt_table["s"]      # normally is &null
X
X    #
X    # Calculate string representation length for fields, as well as
X    # the number of bits required for their integer representation.
X    # I.e. if the -m value is 99, this will take two chars to
X    # represent as a string ("99"), but 7 binary "digits" to represent
X    # internally as a base-two integer.
X    #
X    IS.s_len := *string(max_field)
X    IS.len := *exbase10(max_field, 2)
X
X    return opt_table
X
Xend
X
X
X#
X# create_index
X#
X# Reads the text file fname and returns a table keyed by token.  The
X# value stored under each token is the set of location bitmaps at
X# which that token occurs.
X#
X# A line that starts with "::" is a location marker: the rest of the
X# line is handed to digits_2_bitmap(), and the result becomes the
X# current location for the lines that follow.  Any other line is cut
X# into tokens by gettokens(), which is passed IS.is_case_sensitive.
X#
X# Aborts if fname cannot be opened, if a token turns up before the
X# first location marker, or if no line at all could be read.
X#
Xprocedure create_index(fname)
X
X    local infile, words, text, location, word
X
X    infile := open(fname) |
X        abort("create_index","can't open index file, "||fname, 9)
X    words := table()
X
X    while text := read(infile) do text ? {
X        if ="::" then
X            location := digits_2_bitmap(tab(0))    # in indexutl.icn
X        else every word := gettokens(IS.is_case_sensitive) do {
X            # gettokens() resides in a separate file, gettokens.icn.
X            # First sighting of a token: start it off with an empty set.
X            if /words[word] then words[word] := set()
X            insert(words[word], \location) |
X                abort("create_index","text before location-marker",8)
X        }
X    }
X
X    # text is still null only if read() never produced a line.
X    if /text then abort("create_index", "empty input file, "||fname, 8)
X    close(infile)
X    return words
X
Xend
X
X
X#
X# write_tokens_and_offsets
X#
X# Writes to one file a list of all tokens collected from the input
X# file, one to a line, followed by a tab, and then the offset (as
X# reported by where()) into another file where the bitmaps for that
X# token are kept.
X#
X#     token tab offset
X#
X# A seek to "offset" in the bitmap file will put you at the start of a
X# block of bitmaps: a 16-bit count, then that many bitmaps of
X# bitmap_length bits each.
X#
X# index_fname and bitmap_fname name the two output files; t is the
X# token table (token -> set of bitmaps).  Returns the number of keys
X# written.  Aborts if either file cannot be opened or if one token has
X# more than 65535 bitmaps.
X#
Xprocedure write_tokens_and_offsets(index_fname, bitmap_fname, t)
X
X    local outtokens, outbitmaps, index_lst, i, bitmap_length
X
X    outtokens := open(index_fname, "w") |
X	abort("write_tokens_and_offsets","can't open "||index_fname,6)
X    outbitmaps := open(bitmap_fname, "w") |
X	abort("write_tokens_and_offsets","can't open "||bitmap_fname,5)
X    # Calculate the length of bitmaps (must be the smallest multiple of
X    # 8 >= (IS.len * IS.no)).
X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
X    # sort(t, 3) gives a flat list: key, value, key, value, ...
X    index_lst := sort(t, 3)
X
X    every i := 1 to *index_lst-1 by 2 do {
X
X	# Write token to index file with the offset of that token's
X	# bitmaps in the bitmap file.
X	write(outtokens, index_lst[i], "\t", where(outbitmaps))
X
X	# Now write the bitmaps for the above token to the bitmap file.
X	# First write out the number of bitmaps in this block.  Two bytes
X	# are allotted to hold this count (16 bits).
X	if *index_lst[i+1] > 65535 then {        # just in case
X	    abort("write_tokens_and_offsets",
X		  "too many bitmaps for "||index_lst[i], 16)
X	}
X	write_int(outbitmaps, *index_lst[i+1], 16)
X	# Having written the bitmap count, now write the bitmaps proper
X	# to the bitmap file.
X	every write_int(outbitmaps, !index_lst[i+1], bitmap_length)
X    }
X
X    # Close files.  Return number of keys processed (any better ideas??)
X    every close(outtokens | outbitmaps)
X    return *index_lst / 2	# return number of keys in index file
X
Xend
X
X
X
X#
X# store_bitmaps_and_offsets
X#
X# Runs through the file called fname, finding all the location
X# markers, and recording the offset of the marker line that opens
X# each new major division.  Returns a table of bitmap : offset pairs;
X# the table is not written anywhere by this procedure.  Note that the
X# full bitmap is not stored.  Rather only the first upto_field fields
X# are kept (the remaining fields are shifted out).  makeind's main
X# procedure passes (IS.no * 2) / 3 for upto_field, or 1 when that is
X# not greater than 1.
X#
X# Aborts if fname cannot be opened.
X#
Xprocedure store_bitmaps_and_offsets(fname, upto_field)
X
X    local intext, current_location, last_major_division,
X	major_division, bitmap_offset_table, line
X
X    intext := open(fname) |
X	abort("store_bitmaps_and_offsets","can't open "||fname,5)
X    bitmap_offset_table := table()
X
X    # Note the position before each read, so that current_location is
X    # where the line just read begins.
X    while (current_location := where(intext), line := read(intext)) do {
X	line ? {
X	    if ="::" then {
X		major_division := 
X		    ishift(digits_2_bitmap(tab(0)), # in indexutl.icn
X			   -((IS.no - upto_field) * IS.len))
X		# Only the first marker of a run with the same major
X		# division gets recorded.
X		if \last_major_division = major_division then
X		    next
X		else {
X		    insert(
X			bitmap_offset_table, major_division, current_location)
X		    last_major_division := major_division
X		}	    
X	    }
X	}
X    }
X
X    # Done reading; release the input file.
X    close(intext)
X    return bitmap_offset_table
X
Xend
X
X
X#
X# write_bitmaps_and_offsets
X#
X# Does the actual writing of bitmaps and offsets to a file.  Receives
X# a table of bitmaps cut down to upto_field fields.  Shrinking the
X# bitmaps lessens the size of the resulting file, but requires a bit
X# more I/O when it comes time to look something up.
X#
X# bofname is the output file name; t maps each shortened bitmap to its
X# offset.  Each record written is
X#
X#     block size (8 bits) | bitmap | offset
X#
X# where the block size counts the bytes taken by bitmap and offset
X# together.  Aborts if bofname cannot be opened or if a block would
X# exceed 255 bytes.
X#
Xprocedure write_bitmaps_and_offsets(bofname, t, upto_field)
X
X    local outtext, tmp_list, i, offset_length,
X	block_size, stored_bitmap_length
X
X    outtext := open(bofname, "w") |
X	abort("write_bitmaps_and_offsets","can't open "||bofname,5)
X    # Smallest multiple of 8 >= (IS.len * upto_field).
X    stored_bitmap_length := ((IS.len * upto_field) <= seq(0,8))
X    # sort(t, 3) gives a flat list: bitmap, offset, bitmap, offset, ...
X    tmp_list := sort(t, 3)
X
X    every i := 1 to *tmp_list-1 by 2 do {
X
X	# Number of bits needed to hold offset.
X	offset_length := (*exbase10(tmp_list[i+1], 2) <= seq(0,8))
X	# Number of bytes needed to hold bitmap and offset (both).
X	block_size := (stored_bitmap_length + offset_length) / 8
X
X	# We could just code the length of the offset, since the bitmap's
X	# length is fixed (and known).  Seems better to code the block's
X	# total length just in case something gets screwed up.  An 8-bit
X	# limit means the bitmap+offset length cannot exceed 2^8-1 (255)
X	# characters.
X	if block_size > 255 then
X	    abort("write_bitmaps_and_offsets","bitmap+offset too big",15)
X	write_int(outtext, block_size, 8)
X	write_int(outtext, tmp_list[i], stored_bitmap_length)
X	write_int(outtext, tmp_list[i+1], offset_length)
X
X    }
X
X    # Close the output file rather than relying on program exit.
X    close(outtext)
X    return
X
Xend
X
X#
X# write_limits
X#
X# Writes out the bitmaps that will be needed in order for expandrf()
X# to be able to know when the rollover field rolls over.
X#
X# Reads the location markers ("::" lines) of in_fname in order.  Each
X# bitmap is compared with its predecessor after shifting out field
X# r_field and every field after it; whenever the remaining leading
X# fields differ, the predecessor (the last bitmap seen under the old
X# leading fields) is written to out_fname in full.  The final bitmap
X# in the file is always written.  Returns the number of markers read.
X#
X# Aborts if r_field exceeds IS.no or if either file cannot be opened.
X#
Xprocedure  write_limits(out_fname, in_fname, r_field)
X
X    local in, out, shift_bits_out, bitmap_length, bitmaps_read,
X	 line, bitmap, short_bitmap, old_bitmap
X
X    # Check the argument before touching any file, so that a bad -l
X    # value does not leave a freshly truncated output file behind.
X    r_field <= IS.no |
X	abort("write_limits","-l value should not exceed that of -n",50)
X    in := open(in_fname) |
X	abort("write_limits","can't open "||in_fname,5)
X    out := open(out_fname, "w") |
X	abort("write_limits","can't open "||out_fname,5)
X    # Negative shift = shift right past the rollover field itself.
X    shift_bits_out := -(((IS.no-r_field)+ 1) * IS.len)
X    # Smallest multiple of 8 >= (IS.len * IS.no).
X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
X    bitmaps_read := 0
X
X    while line := read(in) do {
X	line ? {
X	    if ="::" then {
X		bitmaps_read +:= 1
X		bitmap := digits_2_bitmap(tab(0)) # in indexutl.icn
X		short_bitmap := ishift(bitmap, shift_bits_out)
X		# Both sides are integers, so compare numerically.
X		if ishift(\old_bitmap, shift_bits_out) ~= short_bitmap
X		then write_int(out, old_bitmap, bitmap_length)
X		old_bitmap := bitmap
X	    }
X	}
X    }
X
X    # The last marker closes the last group; nothing is written if the
X    # file had no markers at all.
X    write_int(out, \old_bitmap, bitmap_length)
X    every close(in | out)
X    return bitmaps_read
X		    
Xend
SHAR_EOF
echo 'File makeind.icn is complete' &&
true || echo 'restore of makeind.icn failed'
# makeind.icn is finished: drop the "currently writing" marker.
rm -f _shar_wnt_.tmp
fi
# ============= gettokens.icn ==============
# An existing gettokens.icn is left alone unless the script was given
# -c as its first argument.
if test -f 'gettokens.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping gettokens.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
# Create the "currently writing" marker, then extract the file,
# stripping the leading "X" from every here-document line.
> _shar_wnt_.tmp
echo 'x - extracting gettokens.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'gettokens.icn' &&
X############################################################################
X#
X#	Name:	 gettokens.icn
X#
X#	Title:	 get tokens from text-base file
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.2
X#
X############################################################################
X#
X#  Tokenizing routine used by makeind.icn to create index.
X#
X############################################################################
X#
X#  See also: ./makeind.icn
X#
X#############################################################################
X
X# declared in ./indexutl.icn (q.v.)
X# global IS
X#
X# One idea for gettokens, good for small indices.  Uses field separator
X# (IS.FS).  Also uses (slow) findre.  Farther below is a less flexible
X# version of gettokens which runs faster.
X#
X#procedure gettokens(is_case_sensitive)
X#
X#    # Used within a scanning expression.  Returns tokens in
X#    # &subject[&pos:0] (&pos normally = 1).  Tokens are stretches of
X#    # text separated by the IS.FS field separator.  This
X#    # field separator is a nawk style FS regular expression.  If null,
X#    # it gets defined as ~(&digits++&letters).
X#
X#    local token
X#    static non_alphanums
X#    initial non_alphanums := ~(&digits ++ &letters ++ '-')
X#
X#    /IS.FS := non_alphanums
X#    
X#    while token := tab(findre(IS.FS)) do {
X#	tab(__endpoint)
X#	tab(many('\''))		# unfortunate by-product of findre's weakness
X#	if \is_case_sensitive
X#	then suspend "" ~== trim(token,'\t ')
X#	else suspend map("" ~== trim(token,'\t '))
X#    }
X#
X#    # Return the rest of &subject.  Even though we're not tabbing
X#    # upto FS, this is normally what the user intends.
X#    if \is_case_sensitive
X#    then return "" ~== trim(tab(0),'\t ')
X#    else return map("" ~== trim(tab(0),'\t '))
X#
X#end
X
Xprocedure gettokens(is_case_sensitive)
X
X    # Generator; meant to be called from inside a scanning expression,
X    # where it works on &subject[&pos:0] (&pos normally = 1).  It skips
X    # ahead to the next letter, digit or dash, takes the longest run
X    # of letters, digits, dashes and apostrophes found there, and
X    # strips trailing tabs, blanks, apostrophes and dashes from it.
X    # Whatever is left, if not empty, is suspended - mapped to lower
X    # case unless is_case_sensitive is non-null.  Fails when no
X    # further run can be found.
X
X    local word
X    static starters, members, trimmable
X    initial {
X        starters  := &digits ++ &letters ++ '-'
X        members   := starters ++ '\''
X        trimmable := '\t \'-'
X    }
X
X    repeat {
X        tab(upto(starters))
X        word := tab(many(members)) | fail
X        word := trim(word, trimmable)
X        if *word > 0 then
X            suspend (if \is_case_sensitive then word else map(word))
X    }
X
Xend
SHAR_EOF
true || echo 'restore of gettokens.icn failed'
# gettokens.icn is finished: drop the "currently writing" marker.
rm -f _shar_wnt_.tmp
fi
# ============= Makefile.dist ==============
# An existing Makefile.dist is left alone unless the script was given
# -c as its first argument.
if test -f 'Makefile.dist' -a X"$1" != X"-c"; then
	echo 'x - skipping Makefile.dist (File already exists)'
	rm -f _shar_wnt_.tmp
else
# Create the "currently writing" marker, then extract the file,
# stripping the leading "X" from every here-document line.
> _shar_wnt_.tmp
echo 'x - extracting Makefile.dist (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'Makefile.dist' &&
X##########################################################################
X#
X#  Makefile.dist for bibleref.
X#
X##########################################################################
X#
X#  User-modifiable section.  Read carefully!  You will almost
X#  certainly have to change some settings here.
X#
X
X#
X# Destination directory for binaries; library directory for auxiliary
X# files.  Owner and group for public executables.  Leave the trailing
X# slash off of directory names.
X#
XDESTDIR = /usr/local/bin
X# DESTDIR = $(HOME)/bin
XLIBDIR = /usr/local/lib/$(PROGNAME)
X# LIBDIR = $(HOME)/$(PROGNAME)
X# LIBDIR = /usr/local/share/lib/$(PROGNAME)
X# Alternatives for OWNER and GROUP are kept on comment lines of their
X# own: a comment placed after a value on the same line leaves the
X# blanks in front of the "#" as part of the value.
XOWNER = root
X# OWNER = bin
XGROUP = root
X# GROUP = bin
X
X#
X# Name of your icon compiler and compiler flags.
X#
XICONC = /usr/icon/v8/bin/icont
XIFLAGS = -Sc 200 -Si 1000 -Sn 2000 -SF 30
X
X#
X# Names of KJV files as packaged in the PC-SIG disk set (19 discs).
X# Mine were snarfed from helens.stanford.edu (36.0.2.99) as kjv.tar.Z.
X# You will need to link these to the current directory.  Please don't
X# copy them all over, or if you do, be sure to delete them afterwards.
X# They aren't needed after you are done indexing.
X#
XRAWFILES = gen.txt exo.txt lev.txt num.txt deu.txt jos.txt jdg.txt \
X	rth.txt sa1.txt sa2.txt ki1.txt ki2.txt ch1.txt ch2.txt \
X	ezr.txt neh.txt est.txt job.txt psa.txt pro.txt ecc.txt \
X	son.txt isa.txt jer.txt lam.txt eze.txt dan.txt hos.txt \
X	joe.txt amo.txt oba.txt jon.txt mic.txt nah.txt hab.txt \
X	zep.txt hag.txt zec.txt mal.txt mat.txt mar.txt luk.txt \
X	joh.txt act.txt rom.txt co1.txt co2.txt gal.txt eph.txt \
X	phi.txt col.txt th1.txt th2.txt ti1.txt ti2.txt tit.txt \
X	phm.txt heb.txt jam.txt pe1.txt pe2.txt jo1.txt jo2.txt \
X	jo3.txt jud.txt rev.txt
X#
X# If you have your KJV in a single file, that's fine.  Just be sure
X# the books are in their correct order (as above), and are in the PC-SIG
X# disk-set format.
X# RAWFILES = ./kjv.Z
X
X#
X# If you've compressed your KJV file(s), use zcat; otherwise use cat.
X#
XCAT = cat
X# CAT = zcat
X
X#
X# Change these only if you're pretty sure of what you're doing.
X#
XSHELL = /bin/sh
XMAKE = make
X
X
X###########################################################################
X#
X#  Don't change anything below this line.
X#
X
X# Text file written by the converter and then read by the indexer.
XRTVFILE = kjv.rtv
X
X# Converter program (raw KJV files -> $(RTVFILE)) and its sources.
XCONVERTER = kjv2rtv
XCONVERTSRC = $(CONVERTER).icn convertr.icn name2num.icn complete.icn
X
X# Indexing program (run on $(RTVFILE)) and its sources.
XINDEXER = makeind
XINDEXSRC = $(INDEXER).icn gettokens.icn indexutl.icn
X
X# DUMMY_FILE is a stamp touched once conversion and indexing have
X# finished.  PROGNAME names the search program and is also the last
X# component of the default LIBDIR.
XDUMMY_FILE = index.done
XPROGNAME = bibleref
X
X# Everything the search program is compiled from.  $(PROGNAME).icn is
X# itself generated from $(PROGNAME).src by a rule further down.
XSEARCHSRC = $(PROGNAME).icn ref2bmap.icn name2num.icn convertb.icn \
X	listutil.icn passutil.icn srchutil.icn complete.icn \
X	ipause.icn rewrap.icn binsrch.icn bmp2text.icn initfile.icn \
X	retrieve.icn indexutl.icn retrops.icn whatnext.icn iolib.icn \
X	iscreen.icn findre.icn
X
X# Default goal: build the index (once), then the search program.
Xall: $(DUMMY_FILE) $(PROGNAME)
X.PHONY: all
X
X# Convert the raw text and index it, then leave a stamp file behind.
X# The rule deliberately has no prerequisites, so an existing stamp is
X# never considered out of date and the (slow) indexing is not redone.
X# The converter and indexer are run as ./name because they have just
X# been built in the current directory, which need not be in PATH.
X$(DUMMY_FILE):
X	@echo ""
X	@echo "This may take a while (about 1 minute/MB on a Sun4)."
X	@echo ""
X	@sleep 2
X	$(ICONC) $(IFLAGS) -o $(CONVERTER) $(CONVERTSRC)
X	$(CAT) $(RAWFILES) | ./$(CONVERTER) > $(RTVFILE)
X	@echo ""
X	@echo "This may take a long time (c. 20 min./MB on a Sun4)."
X	@echo "Kids, don't even *think* of trying this at home."
X	@echo ""
X	@sleep 2
X	$(ICONC) $(IFLAGS) -o $(INDEXER) $(INDEXSRC)
X	./$(INDEXER) -f $(RTVFILE) -m 200 -n 3 -l 3
X	touch $(DUMMY_FILE)
X
X# The search program, compiled from every file in SEARCHSRC.
X$(PROGNAME): $(SEARCHSRC)
X	$(ICONC) $(IFLAGS) -o $@ $(SEARCHSRC)
X
X# Generate the main source from its template, replacing the built-in
X# path of the text file with the one under LIBDIR.
X$(PROGNAME).icn: $(PROGNAME).src
X	sed "s|/usr/local/lib/bibleref/kjv.rtv|$(LIBDIR)/$(RTVFILE)|" \
X		$(PROGNAME).src > $@
X
X# Stand-alone builds of the converter and of the indexer.
X$(CONVERTER): $(CONVERTSRC)
X	$(ICONC) $(IFLAGS) -o $@ $(CONVERTSRC)
X
X$(INDEXER): $(INDEXSRC)
X	$(ICONC) $(IFLAGS) -o $@ $(INDEXSRC)
X
X
X##########################################################################
X#
X#  Pseudo-target names (install, clean, clobber)
X#
X
X# Pessimistic assumptions regarding the environment (in particular,
X# I don't assume you have the BSD "install" shell script).
X#
X# Copies the search program to DESTDIR and moves the text file and
X# the xxx* files into LIBDIR, setting group and owner on each.
X# The mkdir/chmod pair is grouped in braces: "a || b && c" is read by
X# the shell as "(a || b) && c", which would chmod a directory that
X# already existed.  Only a directory created here gets mode 755.
Xinstall: all
X	-test -d $(DESTDIR) || { mkdir $(DESTDIR) && chmod 755 $(DESTDIR); }
X	cp $(PROGNAME) $(DESTDIR)/$(PROGNAME)
X	chgrp $(GROUP) $(DESTDIR)/$(PROGNAME)
X	chown $(OWNER) $(DESTDIR)/$(PROGNAME)
X	-test -d $(LIBDIR) || { mkdir $(LIBDIR) && chmod 755 $(LIBDIR); }
X	mv xxx* $(RTVFILE) $(LIBDIR)/
X	chgrp $(GROUP) $(LIBDIR)
X	chown $(OWNER) $(LIBDIR)
X	chgrp $(GROUP) $(LIBDIR)/xxx* $(LIBDIR)/$(RTVFILE)
X	chown $(OWNER) $(LIBDIR)/xxx* $(LIBDIR)/$(RTVFILE)
X	@echo ""
X	@echo "Done."
X	@echo ""
X.PHONY: install
X
X#
X# For storing the pre-indexed files.  All that needs to be done here
X# is to unpack the archive on another machine, and make $(PROGNAME).
X#
X# NOTE(review): AUXILSRC is not assigned anywhere in this Makefile, so
X# it expands to nothing and the archive holds only the four names
X# spelled out below.  Presumably it was meant to list the remaining
X# sources and/or the index files - confirm and define it.
X#
Xtar: all
X	tar -cf ./$(PROGNAME).tar $(PROGNAME).src $(DUMMY_FILE) $(AUXILSRC) \
X		Makefile.dist README
X
X#
X# Cleanup: removes the three compiled programs only.
X#
Xclean:
X	rm -f $(CONVERTER) $(INDEXER) $(PROGNAME)
X
X# Be careful; use this target, and you'll be back to square one.
X# Besides what "clean" removes, this deletes the raw text files, the
X# xxx*.??? files, the converted text, the stamp file and the generated
X# $(PROGNAME).icn.
Xclobber: clean
X	@echo "Okay, you asked for it."
X	rm -f $(RAWFILES) xxx*.??? $(RTVFILE) $(DUMMY_FILE) $(PROGNAME).icn
X
X# These names are commands, not files to be produced.
X.PHONY: tar clean clobber
SHAR_EOF
true || echo 'restore of Makefile.dist failed'
# Makefile.dist is finished: drop the "currently writing" marker.
rm -f _shar_wnt_.tmp
fi
# ============= README ==============
# An existing README is left alone unless the script was given -c as
# its first argument.
if test -f 'README' -a X"$1" != X"-c"; then
	echo 'x - skipping README (File already exists)'
	rm -f _shar_wnt_.tmp
else
# Create the "currently writing" marker and write the first line of
# README.  The marker is not removed in this branch: the file is
# continued in the next part.
> _shar_wnt_.tmp
echo 'x - extracting README (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'README' &&
X--------
SHAR_EOF
true || echo 'restore of README failed'
fi
echo 'End of  part 9'
echo 'File README is continued in part 10'
# Record the number of the part that must be unpacked next.
echo 10 > _shar_seq_.tmp
exit 0
-- 

   -Richard L. Goerwitz              goer%sophist@uchicago.bitnet
   goer@sophist.uchicago.edu         rutgers!oddjob!gide!sophist!goer