Newsgroups: alt.sources From: goer@ellis.uchicago.edu (Richard L. Goerwitz) Subject: kjv browser, part 9 of 11 Message-ID: <1991Jul3.065253.28409@midway.uchicago.edu> Date: Wed, 3 Jul 1991 06:52:53 GMT ---- Cut Here and feed the following to sh ---- #!/bin/sh # this is bibleref.09 (part 9 of a multipart archive) # do not concatenate these parts, unpack them in order with /bin/sh # file makeind.icn continued # if test ! -r _shar_seq_.tmp; then echo 'Please unpack part 1 first!' exit 1 fi (read Scheck if test "$Scheck" != 9; then echo Please unpack part "$Scheck" next! exit 1 else exit 0 fi ) < _shar_seq_.tmp || exit 1 if test ! -f _shar_wnt_.tmp; then echo 'x - still skipping makeind.icn' else echo 'x - continuing file makeind.icn' sed 's/^X//' << 'SHAR_EOF' >> 'makeind.icn' && X # (keys are option letters). X # X usage:= "usage: makeind -f filename -m int -n int [-l int] [-s]" X opt_table := initialize_IS(a) X fname := \opt_table["f"] | stop(usage) X rollover_field := opt_table["l"] # (optional) X X # X # Begin the process of tokenizing, recording token locations, and X # of storing this information in two separate files. X # X # Read input file, making a table of words and their locations. X index_table := create_index(fname) X X # X # Write keys to one file, with pointers into another file X # containing the bitmaps for each key. X # X index_fname := dir_name(fname)||create_fname(fname, "IND") X bitmap_fname := dir_name(fname)||create_fname(fname, "BMP") X write_tokens_and_offsets(index_fname, bitmap_fname, index_table) X X # X # Re-open fname and store the locations for each chunk of text X # marked by a ::location marker. This could certainly be X # incorporated into the indexing routines, but only at the great X # expense of clarity. 
X # X upto_field := 1 < (IS.no * 2) / 3 | 1 X bofname := dir_name(fname)||create_fname(fname, "OFS") X bitmap_offset_table := X store_bitmaps_and_offsets(fname, upto_field) X # store in .OFS file X write_bitmaps_and_offsets(bofname, bitmap_offset_table, upto_field) X X # X # Re-open fname again, and store the pre-rollover bitmaps in the X # .LIM file. Obviously this procedure could be stuffed into X # another one above (e.g. store_bitmaps_and_offsets()). X # X if \rollover_field then { X # X # Let's say we are using the Bible as our text, and we want to X # create all the bitmaps for Genesis 1:9-2:10. We need to know X # what verse chapter 1 goes up to. By supplying makeind X # with a "-l 3" argument, you are telling it to store this in- X # formation for later use by expandrf(). X # X limits_fname := dir_name(fname)||create_fname(fname, "LIM") X write_limits(limits_fname, fname, rollover_field) X IS.r_field := rollover_field X } X X # X # Write IS record to the .IS file. X # X out_IS := open(dir_name(fname)||create_fname(fname, "IS"), "w") | X abort("makeind","can't open .IS file",2) X writes(out_IS, encode(IS)) X close(out_IS) X X # All is well. Exit with zero status. X exit(0) X Xend X X X# X# initialize_IS X# X# Sets up main parameters for the current index file, such as the X# field separator to be used in tokenizing the file, the string and X# bit lengths of bitmap fields, the number of fields, and the size of X# the actual bitmaps (in bytes) as written to disk (comes out to the X# smallest multiple of eight greater than the field length times the X# field number. The marker length has to be set in the main X# procedure, so initialize_IS leaves it null for now. 
X#
Xprocedure initialize_IS(a)
X
X    # Parses the argument list a, fills in the global IS record, and
X    # returns the option table so that the caller can pick up the
X    # remaining settings (main reads "f" and "l" from it).  Stops with
X    # a usage message if a mandatory option (-m, -n) is missing or the
X    # number of options is out of range.
X
X    local usage, opt_table
X    # global IS
X
X    usage:="usage: makeind -f filename -m int -n int [-l int] [-s]"
X
X    IS := is()                  # set up some IS fields
X    opt_table := options(a, "f:m:n+sS:l+")
X    3 <= *opt_table <= 6 | stop(usage)
X    IS.no := \opt_table["n"] | stop(usage)
X    # -m is as mandatory as -n: without it the field widths computed
X    # below would be left unset, so complain right away.
X    \opt_table["m"] | stop(usage)
X    IS.FS := \opt_table["S"] | "['.]?[^-0-9A-Za-z']+'?"
X    IS.is_case_sensitive := opt_table["s"]   # normally is &null
X
X    #
X    # Calculate string representation length for fields, as well as
X    # the number of bits required for their integer representation.
X    # I.e. if the opt_table["m"] value is 99, this will take two chars to
X    # represent as a string ("99"), but 7 binary "digits" to represent
X    # internally as a base-two integer.
X    #
X    IS.s_len := *string(opt_table["m"])
X    IS.len   := *exbase10(opt_table["m"], 2)
X
X    return opt_table
X
Xend
X
X
X#
X# create_index
X#
X# Creates a table containing all tokens in the file fname, with the
X# set of each token's locations recorded as values for those tokens.
X# Lines beginning with "::" are location markers; every token found on
X# the following text lines is filed under the bitmap of the most recent
X# marker.  Text that precedes the first marker, or an empty input
X# file, is a fatal error.
X#
X# IS.FS is a nawk-style field separator regular expression.
X# If &null, defaults to ~(&digits++&letters).  IS.s_len
X# is the location marker string-representation field length.
X# IS.len is the number of binary digits needed for an
X# integer representation of a given field.  IS.no is
X# the number of fields.
X#
Xprocedure create_index(fname)
X
X    local intext, wordtbl, line, bitmap, token
X
X    intext := open(fname) |
X        abort("create_index","can't open index file, "||fname, 9)
X    wordtbl := table()
X
X    while line := read(intext) do {
X        line ? {
X            if ="::" then {
X                bitmap := digits_2_bitmap(tab(0)) # in indexutl.icn
X            } else {
X                # gettokens() resides in a separate file, gettokens.icn
X                every token := gettokens(IS.is_case_sensitive) do {
X                    /wordtbl[token] := set()
X                    # \bitmap fails until the first marker has been seen
X                    insert(wordtbl[token], \bitmap) |
X                        abort("create_index","text before location-marker",8)
X                }
X            }
X        }
X    }
X    # line is still &null only if read() never succeeded
X    \line | abort("create_index", "empty input file, "||fname, 8)
X    close(intext)
X    return wordtbl
X
Xend
X
X
X#
X# write_tokens_and_offsets
X#
X# Writes to one file a list of all tokens collected from the input
X# file, one to a line, followed by a tab, and then a byte offset into
X# another file where the bitmaps for that token are kept.
X#
X#        token tab offset
X#
X# A seek to "offset" in the bitmap file will put you at the start of a
X# block of bitmaps.  Each block consists of a 16-bit bitmap count
X# followed by that many bitmaps.  Returns the number of keys written.
X#
Xprocedure write_tokens_and_offsets(index_fname, bitmap_fname, t)
X
X    local outtokens, outbitmaps, index_lst, i, bitmap_length
X
X    outtokens := open(index_fname, "w") |
X        abort("write_tokens_and_offsets","can't open "||index_fname,6)
X    outbitmaps := open(bitmap_fname, "w") |
X        abort("write_tokens_and_offsets","can't open "||bitmap_fname,5)
X    # Calculate the length of bitmaps (must be the smallest multiple of
X    # 8 >= (IS.len * IS.no)).
X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
X    # sort(t, 3) yields a flat list: key, value, key, value, ...
X    index_lst := sort(t, 3)
X
X    every i := 1 to *index_lst-1 by 2 do {
X
X        # Write token to index file with the offset of that token's
X        # bitmaps in the bitmap file.
X        write(outtokens, index_lst[i], "\t", where(outbitmaps))
X
X        # Now write the bitmaps for the above token to the bitmap file.
X        # First write out the number of bitmaps in this block.  Two bytes
X        # are allotted to hold this count (16 bits).
X        if *index_lst[i+1] > 65535 then { # just in case
X            abort("write_tokens_and_offsets",
X                  "too many bitmaps for "||index_lst[i], 16)
X        }
X        write_int(outbitmaps, *index_lst[i+1], 16)
X        # Having written the bitmap count, now write the bitmaps proper
X        # to the bitmap file.
X        every write_int(outbitmaps, !index_lst[i+1], bitmap_length)
X    }
X
X    # Close files.  Return number of keys processed (any better ideas??)
X    every close(outtokens | outbitmaps)
X    return *index_lst / 2  # return number of keys in index file
X
Xend
X
X
X
X#
X# store_bitmaps_and_offsets
X#
X# Runs through the file called fname, finding all the location
X# markers, and recording the offset of the marker line that opens each
X# new major division.  Returns a table of bitmap : offset pairs (the
X# caller hands it to write_bitmaps_and_offsets() for storage).  Note
X# that the full bitmap is not stored.  Rather only the first upto_field
X# fields are stored.  The main procedure sets upto_field to two thirds
X# of IS.no (minimum 1), i.e. to 2 when IS.no is 3.
X#
Xprocedure store_bitmaps_and_offsets(fname, upto_field)
X
X    local intext, line, current_location, last_major_division,
X        major_division, bitmap_offset_table
X
X    intext := open(fname) |
X        abort("store_bitmaps_and_offsets","can't open "||fname,5)
X    bitmap_offset_table := table()
X
X    # The position is sampled *before* each read, so it is the offset of
X    # the line just read.
X    while (current_location := where(intext), line := read(intext)) do {
X        line ? {
X            if ="::" then {
X                # Shift the low-order (IS.no - upto_field) fields away.
X                major_division :=
X                    ishift(digits_2_bitmap(tab(0)),   # in indexutl.icn
X                           -((IS.no - upto_field) * IS.len))
X                if \last_major_division = major_division then
X                    next
X                else {
X                    insert(
X                        bitmap_offset_table, major_division, current_location)
X                    last_major_division := major_division
X                }
X            }
X        }
X    }
X
X    close(intext)
X    return bitmap_offset_table
X
Xend
X
X
X#
X# write_bitmaps_and_offsets
X#
X# Does the actual writing of bitmaps and offsets to a file.  Receives
X# a table of bitmaps cut down to upto_field fields.  Shrinking the
X# bitmaps lessens the size of the resulting file, but requires a bit
X# more I/O when it comes time to look something up.
X#
Xprocedure write_bitmaps_and_offsets(bofname, t, upto_field)
X
X    # Each record written to bofname consists of:  one byte holding the
X    # combined length (in bytes) of what follows, the bitmap (cut down
X    # to upto_field fields and padded to a whole number of bytes), and
X    # the offset (also padded to a whole number of bytes).
X
X    local outtext, tmp_list, i, offset_length,
X        block_size, stored_bitmap_length
X
X    outtext := open(bofname, "w") |
X        abort("write_bitmaps_and_offsets","can't open "||bofname,5)
X    # Smallest multiple of 8 >= (IS.len * upto_field).
X    stored_bitmap_length := ((IS.len * upto_field) <= seq(0,8))
X    # sort(t, 3) yields a flat list: bitmap, offset, bitmap, offset, ...
X    tmp_list := sort(t, 3)
X
X    every i := 1 to *tmp_list-1 by 2 do {
X
X        # Number of bits needed to hold offset.
X        offset_length := (*exbase10(tmp_list[i+1], 2) <= seq(0,8))
X        # Number of bytes needed to hold bitmap and offset (both).
X        block_size := (stored_bitmap_length + offset_length) / 8
X
X        # We could just code the length of the offset, since the bitmap's
X        # length is fixed (and known).  Seems better to code the block's
X        # total length just in case something gets screwed up.  An 8-bit
X        # limit means the bitmap+offset length cannot exceed 2^8-1 (255)
X        # characters.
X        if block_size > 255 then
X            abort("write_bitmaps_and_offsets","bitmap+offset too big",15)
X        write_int(outtext, block_size, 8)
X        write_int(outtext, tmp_list[i], stored_bitmap_length)
X        write_int(outtext, tmp_list[i+1], offset_length)
X
X    }
X
X    # Close the output file explicitly, as the other writers do.
X    close(outtext)
X    return
X
Xend
X
X#
X# write_limits
X#
X# Writes out the bitmaps that will be needed in order for expandrf()
X# to be able to know when the rollover field rolls over.  For every
X# run of location markers that agree in the fields preceding r_field,
X# the last full bitmap of the run is written to out_fname.  Returns the
X# number of location markers read.
X#
Xprocedure write_limits(out_fname, in_fname, r_field)
X
X    local in, out, shift_bits_out, bitmap_length, bitmaps_read,
X        line, bitmap, short_bitmap, old_bitmap
X
X    in := open(in_fname) |
X        abort("write_limits","can't open "||in_fname,5)
X    out := open(out_fname, "w") |
X        abort("write_limits","can't open "||out_fname,5)
X    r_field <= IS.no |
X        abort("write_limits","-l value should not exceed that of -n",50)
X    # Negative shift count: drops r_field and every field after it.
X    shift_bits_out := -(((IS.no-r_field)+ 1) * IS.len)
X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
X    bitmaps_read := 0
X
X    while line := read(in) do {
X        line ? {
X            if ="::" then {
X                bitmaps_read +:= 1
X                bitmap := digits_2_bitmap(tab(0)) # in indexutl.icn
X                short_bitmap := ishift(bitmap, shift_bits_out)
X                # Both operands are integers, so compare numerically.
X                # \old_bitmap fails on the first marker; nothing is
X                # written then.
X                if ishift(\old_bitmap, shift_bits_out) ~= short_bitmap
X                then write_int(out, old_bitmap, bitmap_length)
X                old_bitmap := bitmap
X            }
X        }
X    }
X
X    # The final run has no successor to trigger its write; flush it here.
X    write_int(out, \old_bitmap, bitmap_length)
X    every close(in | out)
X    return bitmaps_read
X
Xend
SHAR_EOF
echo 'File makeind.icn is complete' &&
true || echo 'restore of makeind.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= gettokens.icn ==============
if test -f 'gettokens.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping gettokens.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting gettokens.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'gettokens.icn' &&
X############################################################################
X#
X#	Name:	 gettokens.icn
X#
X#	Title:	 get tokens from text-base file
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.2
X#
X############################################################################
X#
X#  Tokenizing routine used by makeind.icn to create index.
X#
X############################################################################
X#
X#  See also: ./makeind.icn
X#
X#############################################################################
X
X# declared in ./indexutl.icn (q.v.)
X# global IS
X#
X# One idea for gettokens, good for small indices.  Uses field separator
X# (IS.FS).  Also uses (slow) findre.  Farther below is a less flexible
X# version of gettokens which runs faster.
X#
X#procedure gettokens(is_case_sensitive)
X#
X#    # Used within a scanning expression.  Returns tokens in
X#    # &subject[&pos:0] (&pos normally = 1).  Tokens are stretches of
X#    # text separated by the IS.FS field separator.  This
X#    # field separator is a nawk style FS regular expression.  If null,
X#    # it gets defined as ~(&digits++&letters).
X#
X#    local token
X#    static non_alphanums
X#    initial non_alphanums := ~(&digits ++ &letters ++ '-')
X#
X#    /IS.FS := non_alphanums
X#
X#    while token := tab(findre(IS.FS)) do {
X#        tab(__endpoint)
X#        tab(many('\''))  # unfortunate by-product of findre's weakness
X#        if \is_case_sensitive
X#        then suspend "" ~== trim(token,'\t ')
X#        else suspend map("" ~== trim(token,'\t '))
X#    }
X#
X#    # Return the rest of &subject.  Even though we're not tabbing
X#    # upto FS, this is normally what the user intends.
X#    if \is_case_sensitive
X#    then return "" ~== trim(tab(0),'\t ')
X#    else return map("" ~== trim(tab(0),'\t '))
X#
X#end
X
Xprocedure gettokens(is_case_sensitive)
X
X    # Used within a scanning expression.  Returns tokens in
X    # &subject[&pos:0] (&pos normally = 1).  Tokens are stretches of
X    # text separated by an optional apostrophe or dash, then any
X    # stretch of non-alphanumeric characters, then an optional apos-
X    # trophe.
X    #
X    # In terms of the code:  a token starts at the next digit, letter
X    # or dash, and runs on through digits, letters, dashes and apos-
X    # trophes.  Trailing tabs, blanks, apostrophes and dashes are
X    # trimmed off; tokens left empty by the trimming are skipped.
X    # Unless is_case_sensitive is non-null, tokens are passed through
X    # map() before being suspended.
X
X    local token
X    static alphanums, wordchars
X    initial {
X        alphanums := &digits ++ &letters ++ '-'
X        wordchars := alphanums ++ '\''
X    }
X
X    tab(upto(alphanums))
X    while token := tab(many(wordchars)) do {
X        if \is_case_sensitive
X        then suspend "" ~== trim(token,'\t \'-')
X        else suspend map("" ~== trim(token,'\t \'-'))
X        tab(upto(alphanums))
X    }
X
Xend
SHAR_EOF
true || echo 'restore of gettokens.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= Makefile.dist ==============
if test -f 'Makefile.dist' -a X"$1" != X"-c"; then
	echo 'x - skipping Makefile.dist (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting Makefile.dist (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'Makefile.dist' &&
X##########################################################################
X#
X#  Makefile.dist for bibleref.
X#
X##########################################################################
X#
X#  User-modifiable section.  Read carefully!  You will almost
X#  certainly have to change some settings here.
X#
X
X#
X# Destination directory for binaries; library directory for auxiliary
X# files.  Owner and group for public executables.  Leave the trailing
X# slash off of directory names.
X#
XDESTDIR = /usr/local/bin
X# DESTDIR = $(HOME)/bin
XLIBDIR = /usr/local/lib/$(PROGNAME)
X# LIBDIR = $(HOME)/$(PROGNAME)
X# LIBDIR = /usr/local/share/lib/$(PROGNAME)
X#
X# Alternatives are kept on lines of their own:  blanks standing between
X# a value and a trailing "#" comment would become part of the value.
X#
XOWNER = root
X# OWNER = bin
XGROUP = root
X# GROUP = bin
X
X#
X# Name of your icon compiler and compiler flags.
X#
XICONC = /usr/icon/v8/bin/icont
XIFLAGS = -Sc 200 -Si 1000 -Sn 2000 -SF 30
X
X#
X# Names of KJV files as packaged in the PC-SIG disk set (19 discs).
X# Mine were snarfed from helens.stanford.edu (36.0.2.99) as kjv.tar.Z.
X# You will need to link these to the current directory.  Please don't
X# copy them all over, or if you do, be sure to delete them afterwards.
X# They aren't needed after you are done indexing.
X#
XRAWFILES = gen.txt exo.txt lev.txt num.txt deu.txt jos.txt jdg.txt \
X	rth.txt sa1.txt sa2.txt ki1.txt ki2.txt ch1.txt ch2.txt \
X	ezr.txt neh.txt est.txt job.txt psa.txt pro.txt ecc.txt \
X	son.txt isa.txt jer.txt lam.txt eze.txt dan.txt hos.txt \
X	joe.txt amo.txt oba.txt jon.txt mic.txt nah.txt hab.txt \
X	zep.txt hag.txt zec.txt mal.txt mat.txt mar.txt luk.txt \
X	joh.txt act.txt rom.txt co1.txt co2.txt gal.txt eph.txt \
X	phi.txt col.txt th1.txt th2.txt ti1.txt ti2.txt tit.txt \
X	phm.txt heb.txt jam.txt pe1.txt pe2.txt jo1.txt jo2.txt \
X	jo3.txt jud.txt rev.txt
X#
X# If you have your KJV in a single file, that's fine.  Just be sure
X# the books are in their correct order (as above), and are in the PC-SIG
X# disk-set format.
X# RAWFILES = ./kjv.Z
X
X#
X# If you've compressed your KJV file(s), use zcat; otherwise use cat.
X#
XCAT = cat
X# CAT = zcat
X
X#
X# Change these only if you're pretty sure of what you're doing.
X#
XSHELL = /bin/sh
XMAKE = make
X
X
X###########################################################################
X#
X#  Don't change anything below this line.
X#
X
XRTVFILE = kjv.rtv
X
XCONVERTER = kjv2rtv
XCONVERTSRC = $(CONVERTER).icn convertr.icn name2num.icn complete.icn
X
XINDEXER = makeind
XINDEXSRC = $(INDEXER).icn gettokens.icn indexutl.icn
X
XDUMMY_FILE = index.done
XPROGNAME = bibleref
X
X#
X# Hand-written sources of the search program.  Together with
X# $(PROGNAME).src (from which $(PROGNAME).icn is generated) these are
X# what the "tar" target must ship so that $(PROGNAME) can be rebuilt on
X# another machine.
X#
XAUXILSRC = ref2bmap.icn name2num.icn convertb.icn \
X	listutil.icn passutil.icn srchutil.icn complete.icn \
X	ipause.icn rewrap.icn binsrch.icn bmp2text.icn initfile.icn \
X	retrieve.icn indexutl.icn retrops.icn whatnext.icn iolib.icn \
X	iscreen.icn findre.icn
X
XSEARCHSRC = $(PROGNAME).icn $(AUXILSRC)
X
Xall: $(DUMMY_FILE) $(PROGNAME)
X
X#
X# Converts and indexes the raw text.  Deliberately has no prerequisites:
X# once $(DUMMY_FILE) exists (e.g. unpacked from the "tar" archive) the
X# conversion and indexing are never redone.  The freshly built helpers
X# are run as ./name so that "." need not be in PATH.
X#
X$(DUMMY_FILE):
X	@echo ""
X	@echo "This may take a while (about 1 minute/MB on a Sun4)."
X	@echo ""
X	@sleep 2
X	$(ICONC) $(IFLAGS) -o $(CONVERTER) $(CONVERTSRC)
X	$(CAT) $(RAWFILES) | ./$(CONVERTER) > $(RTVFILE)
X	@echo ""
X	@echo "This may take a long time (c. 20 min./MB on a Sun4)."
X	@echo "Kids, don't even *think* of trying this at home."
X	@echo ""
X	@sleep 2
X	$(ICONC) $(IFLAGS) -o $(INDEXER) $(INDEXSRC)
X	./$(INDEXER) -f $(RTVFILE) -m 200 -n 3 -l 3
X	touch $(DUMMY_FILE)
X
X$(PROGNAME): $(SEARCHSRC)
X	$(ICONC) $(IFLAGS) -o $(PROGNAME) $(SEARCHSRC)
X
X# Patch the library location chosen above into the main source file.
X$(PROGNAME).icn: $(PROGNAME).src
X	sed "s|/usr/local/lib/bibleref/kjv.rtv|$(LIBDIR)/$(RTVFILE)|" $(PROGNAME).src > $(PROGNAME).icn
X
X$(CONVERTER): $(CONVERTSRC)
X	$(ICONC) $(IFLAGS) -o $(CONVERTER) $(CONVERTSRC)
X
X$(INDEXER): $(INDEXSRC)
X	$(ICONC) $(IFLAGS) -o $(INDEXER) $(INDEXSRC)
X
X
X##########################################################################
X#
X#  Pseudo-target names (install, clean, clobber)
X#
X
X# These names are commands, not files.  Declaring them keeps a stray
X# file called "clean", "install", etc. from masking the target.
X.PHONY: all install tar clean clobber
X
X# Pessimistic assumptions regarding the environment (in particular,
X# I don't assume you have the BSD "install" shell script).
X#
X# The mkdir/chmod pairs are parenthesized so that the mode is set only
X# on a directory created here; an already existing directory keeps its
X# permissions.
Xinstall: all
X	-test -d $(DESTDIR) || (mkdir $(DESTDIR) && chmod 755 $(DESTDIR))
X	cp $(PROGNAME) $(DESTDIR)/$(PROGNAME)
X	chgrp $(GROUP) $(DESTDIR)/$(PROGNAME)
X	chown $(OWNER) $(DESTDIR)/$(PROGNAME)
X	-test -d $(LIBDIR) || (mkdir $(LIBDIR) && chmod 755 $(LIBDIR))
X	mv xxx* $(RTVFILE) $(LIBDIR)/
X	chgrp $(GROUP) $(LIBDIR)
X	chown $(OWNER) $(LIBDIR)
X	chgrp $(GROUP) $(LIBDIR)/xxx* $(LIBDIR)/$(RTVFILE)
X	chown $(OWNER) $(LIBDIR)/xxx* $(LIBDIR)/$(RTVFILE)
X	@echo ""
X	@echo "Done."
X	@echo ""
X
X#
X# For storing the pre-indexed files.  All that needs to be done here
X# is to unpack the archive on another machine, and make $(PROGNAME).
X#
Xtar: all
X	tar -cf ./$(PROGNAME).tar $(PROGNAME).src $(DUMMY_FILE) $(AUXILSRC) \
X		Makefile.dist README
X
X#
X# Cleanup
X#
Xclean:
X	rm -f $(CONVERTER) $(INDEXER) $(PROGNAME)
X
X# Be careful; use this target, and you'll be back to square one.
X# Note that it removes the $(RAWFILES) entries in this directory too.
Xclobber: clean
X	@echo "Okay, you asked for it."
X	rm -f $(RAWFILES) xxx*.??? $(RTVFILE) $(DUMMY_FILE) $(PROGNAME).icn
SHAR_EOF
true || echo 'restore of Makefile.dist failed'
rm -f _shar_wnt_.tmp
fi
# ============= README ==============
if test -f 'README' -a X"$1" != X"-c"; then
	echo 'x - skipping README (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting README (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'README' &&
X--------
SHAR_EOF
true || echo 'restore of README failed'
fi
echo 'End of part 9'
echo 'File README is continued in part 10'
echo 10 > _shar_seq_.tmp
exit 0
-- 

   -Richard L. Goerwitz              goer%sophist@uchicago.bitnet
   goer@sophist.uchicago.edu         rutgers!oddjob!gide!sophist!goer