Newsgroups: comp.sources.misc
From: goer@midway.uchicago.edu (Richard L. Goerwitz)
Subject:  v23i069:  quranref - Holy Qur'an word and passage based retrievals, Part03/08
Message-ID: <1991Oct19.022243.12852@sparky.imd.sterling.com>
X-Md4-Signature: c3c0b2565f4fdb2b1b848a6961d6826c
Date: Sat, 19 Oct 1991 02:22:43 GMT
Approved: kent@sparky.imd.sterling.com

Submitted-by: goer@midway.uchicago.edu (Richard L. Goerwitz)
Posting-number: Volume 23, Issue 69
Archive-name: quranref/part03
Environment: Icon

---- Cut Here and feed the following to sh ----
#!/bin/sh
# this is quranref.03 (part 3 of a multipart archive)
# do not concatenate these parts, unpack them in order with /bin/sh
# file inbits.icn continued
#
if test ! -r _shar_seq_.tmp; then
	echo 'Please unpack part 1 first!'
	exit 1
fi
(read Scheck
 if test "$Scheck" != 3; then
	echo Please unpack part "$Scheck" next!
	exit 1
 else
	exit 0
 fi
) < _shar_seq_.tmp || exit 1
if test ! -f _shar_wnt_.tmp; then
	echo 'x - still skipping inbits.icn'
else
echo 'x - continuing file inbits.icn'
sed 's/^X//' << 'SHAR_EOF' >> 'inbits.icn' &&
X	byte_length := 8
X    }
X
X    old_byte_mask := (0 < 2^old_len - 1) | 0
X    old_byte := iand(old_byte, old_byte_mask)
X    i := ishift(old_byte, len-old_len)
X
X    len -:= (len > old_len) | {
X	old_len -:= len
X	return i
X    }
X    
X    while byte := ord(reads(f)) do {
X	i := ior(i, ishift(byte, len-byte_length))
X	len -:= (len > byte_length) | {
X	    old_len := byte_length-len
X	    old_byte := byte
X	    return i
X	}
X    }
X
Xend
SHAR_EOF
echo 'File inbits.icn is complete' &&
true || echo 'restore of inbits.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= rewrap.icn ==============
if test -f 'rewrap.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping rewrap.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting rewrap.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'rewrap.icn' &&
X############################################################################
X#
X#	Name:	 rewrap.icn
X#
X#	Title:	 advanced line rewrap utility
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.4
X#
X############################################################################
X#
X#  The procedure rewrap(s,i), included in this file, reformats text
X#  fed to it into strings < i in length.  Rewrap utilizes a static
X#  buffer, so it can be called repeatedly with different s arguments,
X#  and still produce homogenous output.  This buffer is flushed by
X#  calling rewrap with a null first argument.  The default for
X#  argument 2 (i) is 70.
X#
X#  Here's a simple example of how rewrap could be used.  The following
X#  program reads the standard input, producing fully rewrapped output.
X#
X#  procedure main()
X#      every write(rewrap(!&input))
X#      write(rewrap())
X#  end
X#
X#  Naturally, in practice you would want to do things like check for in-
X#  dentation or blank lines in order to wrap only on a paragraph-by para-
X#  graph basis, as in
X#
X#  procedure main()
X#      while line := read(&input) do {
X#          if line == "" then {
X#              write("" ~== rewrap())
X#              write(line)
X#          } else {
X#              if match("\t", line) then {
X#                  write(rewrap())
X#                  write(rewrap(line))
X#              } else {
X#                  write(rewrap(line))
X#              }
X#          }
X#      }
X#  end
X#
X#  Fill-prefixes can be implemented simply by prepending them to the
X#  output of rewrap:
X#
X#      i := 70; fill_prefix := " > "
X#      while line := read(input_file) do {
X#          line ?:= (f_bit := tab(many('> ')) | "", tab(0))
X#          write(fill_prefix || f_bit || rewrap(line, i - *fill_prefix))
X#          etc.
X#
X#  Obviously, these examples are fairly simplistic.  Putting them to
X#  actual use would certainly require a few environment-specific
X#  modifications and/or extensions.  Still, I hope they offer some
X#  indication of the kinds of applications rewrap might be used in.
X# 
X#  Note:  If you want leading and trailing tabs removed, map them to
X#  spaces first.  Rewrap only fools with spaces, leaving tabs intact.
X#  This can be changed easily enough, by running its input through the
X#  Icon detab() function.
X#
X############################################################################
X#
X#  See also:  wrap.icn
X#
X############################################################################
X
X
Xprocedure rewrap(s,i)
X
X    local extra_bit, line
X    static old_line
X    initial old_line := ""
X
X    # Default column to wrap on is 70.
X    /i := 70
X    # Flush buffer on null first argument.
X    if /s then {
X	extra_bit := old_line
X	old_line := ""
X	return "" ~== extra_bit
X    }
X
X    # Prepend to s anything that is in the buffer (leftovers from the last s).
X    s ?:= { tab(many(' ')); old_line || trim(tab(0)) }
X
X    # If the line isn't long enough, just add everything to old_line.
X    if *s < i then old_line := s || " " & fail
X
X    s ? {
X
X	# While it is possible to find places to break s, do so.
X	while any(' -',line := EndToFront(i),-1) do {
X	    # Clean up and suspend the last piece of s tabbed over.
X	    line ?:= (tab(many(' ')), trim(tab(0)))
X            if *&subject - &pos + *line > i
X	    then suspend line
X	    else {
X		old_line := ""
X		return line || tab(0)
X	    }
X	}
X
X	# Keep the extra section of s in a buffer.
X	old_line := tab(0)
X
X	# If the reason the remaining section of s was unrewrapable was
X	# that it was too long, and couldn't be broken up, then just return
X	# the thing as-is.
X	if *old_line > i then {
X	    old_line ? {
X		if extra_bit := tab(upto(' -')+1) || (tab(many(' ')) | "")
X		then old_line := tab(0)
X		else extra_bit := old_line & old_line := ""
X		return trim(extra_bit)
X	    }
X	}
X	# Otherwise, clean up the buffer for prepending to the next s.
X	else {
X	    # If old_line is blank, then don't mess with it.  Otherwise,
X	    # add whatever is needed in order to link it with the next s.
X	    if old_line ~== "" then {
X		# If old_line ends in a dash, then there's no need to add a
X		# space to it.
X		if old_line[-1] ~== "-"
X		then old_line ||:= " "
X	    }
X	}
X    }
X    
Xend
X
X
X
Xprocedure EndToFront(i)
X    # Goes with rewrap(s,i)
X    *&subject+1 - &pos >= i | fail
X    suspend &subject[.&pos:&pos <- &pos+i to &pos by -1]
Xend
SHAR_EOF
true || echo 'restore of rewrap.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= findre.icn ==============
if test -f 'findre.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping findre.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting findre.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'findre.icn' &&
X########################################################################
X#    
X#	Name:	findre.icn
X#	
X#	Title:	"Find" Regular Expression
X#	
X#	Author:	Richard L. Goerwitz
X#
X#	Version: 1.17
X#
X########################################################################
X#
X#  I place this and any later versions in the public domain - RLG.
X#
X########################################################################
X#
X#  DESCRIPTION:  findre() is like the Icon builtin function find(),
X#  except that it takes, as its first argument, a regular expression
X#  pretty much like the ones the Unix egrep command uses (the few
X#  minor differences are listed below).  Its syntax is the same as
X#  find's (i.e. findre(s1,s2,i,j)), with the exception that a no-
X#  argument invocation wipes out all static structures utilized by
X#  findre, and then forces a garbage collection.
X#
X#  (For those not familiar with regular expressions and the Unix egrep
X#  command: findre() offers a simple and compact wildcard-based search
X#  system.  If you do a lot of searches through text files, or write
X#  programs which do searches based on user input, then findre is a
X#  utility you might want to look over.)
X#
X#  IMPORTANT DIFFERENCES between find and findre:  As noted above,
X#  findre() is just a find() function that takes a regular expression
X#  as its first argument.  One major problem with this setup is that
X#  it leaves the user with no easy way to tab past a matched
X#  substring, as with
X# 
X#	s ? write(tab(find("hello")+5))
X#
X#  In order to remedy this intrinsic deficiency, findre() sets the
X#  global variable __endpoint to the first position after any given
X#  match occurs.  Use this variable with great care, preferably
X#  assigning its value to some other variable immediately after the
X#  match (for example, findre("hello [.?!]*",s) & tmp := __endpoint).
X#  Otherwise, you will certainly run into trouble.  (See the example
X#  below for an illustration of how __endpoint is used).
X#
X#  IMPORTANT DIFFERENCES between egrep and findre:  findre utilizes
X#  the same basic language as egrep.  The only big difference is that
X#  findre uses intrinsic Icon data structures and escaping conven-
X#  tions rather than those of any particular Unix variant.  Be care-
X#  ful!  If you put findre("\(hello\)",s) into your source file,
X#  findre will treat it just like findre("(hello)",s).  If, however,
X#  you enter '\(hello\)' at run-time (via, say, findre(!&input,s)),
X#  what Icon receives will depend on your operating system (most
X#  likely, a trace will show "\\(hello\\)").
X#
X#  BUGS:  Space has essentially been conserved at the expense of time
X#  in the automata produced by findre().  The algorithm, in other
X#  words, will produce the equivalent of a pushdown automaton under
X#  certain circumstances, rather than strive (at the expense of space)
X#  for full determinism.  I tried to make up a nfa -> dfa converter
X#  that would only create that portion of the dfa it needed to accept
X#  or reject a string, but the resulting automaton was actually quite
X#  slow (if anyone can think of a way to do this in Icon, and keep it
X#  small and fast, please let us all know about it).  Note that under
X#  version 8 of Icon, findre takes up negligible storage space, due to
X#  the much improved hashing algorithm.  I have not tested it under
X#  version 7, but I would expect it to use up quite a bit more space
X#  in that environment.
X#
X#  IMPORTANT NOTE:  Findre takes a shortest-possible-match approach
X#  to regular expressions.  In other words, if you look for "a*",
X#  findre will not even bother looking for an "a."  It will just match
X#  the empty string.  Without this feature, findre would perform a bit
X#  more slowly.  The problem with such an approach is that often the
X#  user will want to tab past the longest possible string of matched
X#  characters (say tab((findre("a*|b*"), __endpoint)).  In circumstan-
X#  ces like this, please just use something like:
X#
X#      s ? {
X#          tab(find("a")) &  # or use Arb() from the IPL (patterns.icn)
X#          tab(many('a'))
X#          tab(many('b'))
X#      }
X#
X#  or else use some combination of findre and the above.
X#    
X########################################################################
X#
X#  REGULAR EXPRESSION SYNTAX: Regular expression syntax is complex,
X#  and yet simple.  It is simple in the sense that most of its power
X#  is concentrated in about a dozen easy-to-learn symbols.  It is
X#  complex in the sense that, by combining these symbols with
X#  characters, you can represent very intricate patterns.
X#
X#  I make no pretense here of offering a full explanation of regular
X#  expressions, their usage, and the deeper nuances of their syntax.
X#  As noted above, this should be gleaned from a Unix manual.  For
X#  quick reference, however, I have included a brief summary of all
X#  the special symbols used, accompanied by an explanation of what
X#  they mean, and, in some cases, of how they are used (most of this
X#  is taken from the comments prepended to Jerry Nowlin's Icon-grep
X#  command, as posted a couple of years ago):
X#
X#     ^   -  matches if the following pattern is at the beginning
X#            of a line (i.e. ^# matches lines beginning with "#")
X#     $   -  matches if the preceding pattern is at the end of a line
X#     .   -  matches any single character
X#     +   -  matches from 1 to any number of occurrences of the
X#            previous expression (i.e. a character, or set of paren-
X#            thesized/bracketed characters)
X#     *   -  matches from 0 to any number of occurrences of the previous
X#            expression
X#     \   -  removes the special meaning of any special characters
X#            recognized by this program (i.e if you want to match lines
X#            beginning with a "[", write ^\[, and not ^[)
X#     |   -  matches either the pattern before it, or the one after
X#            it (i.e. abc|cde matches either abc or cde)
X#     []  -  matches any member of the enclosed character set, or,
X#            if ^ is the first character, any nonmember of the
X#            enclosed character set (i.e. [^ab] matches any character
X#	     _except_ a and b).
X#     ()  -  used for grouping (e.g. ^(abc|cde)$ matches lines consist-
X#            ing of either "abc" or "cde," while ^abc|cde$ matches
X#            lines either beginning with "abc" or ending in "cde")
X#
X#########################################################################
X#
X#  EXAMPLE program:
X#
X#  procedure main(a)
X#      while line := !&input do {
X#          token_list := tokenize_line(line,a[1])
X#          every write(!token_list)
X#      }
X#  end
X#
X#  procedure tokenize_line(s,sep)
X#      tmp_lst := []
X#      s ? {
X#          while field := tab(findre(sep)|0) &
X#          mark := __endpoint
X#          do {
X#              put(tmp_lst,"" ~== field)
X#              if pos(0) then break
X#              else tab(mark)
X#          }
X#      }
X#      return tmp_lst
X#  end
X#
X#  The above program would be compiled with findre (e.g. "icont
X#  test_prg.icn findre.icn") to produce a single executable which
X#  tokenizes each line of input based on a user-specified delimiter.
X#  Note how __endpoint is set soon after findre() succeeds.  Note
X#  also how empty fields are excluded with "" ~==, etc.  Finally, note
X#  that the temporary list, tmp_lst, is not needed.  It is included
X#  here merely to illustrate one way in which tokens might be stored.
X#
X#  Tokenizing is, of course, only one of many uses one might put
X#  findre to.  It is very helpful in allowing the user to construct
X#  automata at run-time.  If, say, you want to write a program that
X#  searches text files for patterns given by the user, findre would be
X#  a perfect utility to use.  Findre in general permits more compact
X#  expression of patterns than one can obtain using intrinsic Icon
X#  scanning facilities.  Its near complete compatibility with the Unix
X#  regexp library, moreover, makes for greater ease of porting,
X#  especially in cases where Icon is being used to prototype C code.
X#
X#########################################################################
X
X
Xglobal state_table, parends_present, slash_present
Xglobal biggest_nonmeta_str, __endpoint
Xrecord o_a_s(op,arg,state)
X
X
Xprocedure findre(re, s, i, j)
X
X    local p, default_val, x, nonmeta_len, tokenized_re, tmp
X    static FSTN_table, STRING_table
X    initial {
X	FSTN_table := table()
X	STRING_table := table()
X    }
X
X    if /re then {
X	FSTN_table := table()
X	STRING_table := table()
X	collect()  # do it *now*
X	return
X    }
X
X    if /s := &subject
X    then default_val := &pos
X    else default_val := 1
X
X    if \i then {
X	if i < 1 then
X	    i := *s + (i+1)
X    }
X    else i := default_val
X	
X    if \j then {
X	if j < 1 then
X	    j := *s + (j+1)
X    }
X    else j := *s+1
X
X    if /FSTN_table[re] then {
X	# If we haven't seen this re before, then...
X	if \STRING_table[re] then {
X	    # ...if it's in the STRING_table, use plain find()
X	    every p := find(STRING_table[re],s,i,j)
X	    do { __endpoint := p + *STRING_table[re]; suspend p }
X	    fail
X	}
X	else {
X	    # However, if it's not in the string table, we have to
X	    # tokenize it and check for metacharacters.  If it has
X	    # metas, we create an FSTN, and put that into FSTN_table;
X	    # otherwise, we just put it into the STRING_table.
X	    tokenized_re := tokenize(re)
X	    if 0 > !tokenized_re then {
X		# if at least one element is < 0, re has metas
X		MakeFSTN(tokenized_re) | err_out(re,2)
X		# both biggest_nonmeta_str and state_table are global
X		/FSTN_table[re] := [.biggest_nonmeta_str, copy(state_table)]
X	    }
X	    else {
X		# re has no metas; put the input string into STRING_table
X		# for future reference, and execute find() at once
X		tmp := ""; every tmp ||:= char(!tokenized_re)
X		insert(STRING_table,re,tmp)
X		every p := find(STRING_table[re],s,i,j)
X		do { __endpoint := p + *STRING_table[re]; suspend p }
X		fail
X	    }
X	}
X    }
X
X
X    if nonmeta_len := (1 < *FSTN_table[re][1]) then {
X	# If the biggest non-meta string in the original re
X	# was more than 1, then put in a check for it...
X	s[1:j] ? {
X	    tab(x := i to j - nonmeta_len) &
X		(find(FSTN_table[re][1]) | fail) \ 1 &
X		(__endpoint := apply_FSTN(&null,FSTN_table[re][2])) &
X		(suspend x)
X	}
X    }
X    else {
X	#...otherwise it's not worth worrying about the biggest nonmeta str
X	s[1:j] ? {
X	    tab(x := i to j) &
X	    (__endpoint := apply_FSTN(&null,FSTN_table[re][2])) &
X	    (suspend x)
X	}
X    }
X
Xend
X
X
X
Xprocedure apply_FSTN(ini,tbl)
X
X    local biggest_pos, POS, tmp, fin
X    static s_tbl
X
X    /ini := 1 & s_tbl := tbl & biggest_pos := 1
X    if ini = 0 then {
X	return &pos
X    }
X    POS := &pos
X    fin := 0
X
X    repeat {
X	if tmp := !s_tbl[ini] &
X	    tab(tmp.op(tmp.arg))
X	then {
X	    if tmp.state = fin
X	    then return &pos
X	    else ini := tmp.state
X	}
X	else (&pos := POS, fail)
X    }
X
Xend
X    
X
X
Xprocedure tokenize(s)
X
X    local token_list, chr, tmp, b_loc, next_one, fixed_length_token_list, i
X
X    token_list := list()
X    s ? {
X	tab(many('*+?|'))
X	while chr := move(1) do {
X	    if chr == "\\"
X	    # it can't be a metacharacter; remove the \ and "put"
X	    # the integer value of the next chr into token_list
X	    then put(token_list,ord(move(1))) | err_out(s,2,chr)
X	    else if any('*+()|?.$^',chr)
X	    then {
X		# Yuck!  Egrep compatibility stuff.
X		case chr of {
X		    "*"    : {
X			tab(many('*+?'))
X			put(token_list,-ord("*"))
X		    }
X		    "+"    : {
X			tmp := tab(many('*?+')) | &null
X			if upto('*?',\tmp)
X			then put(token_list,-ord("*"))
X			else put(token_list,-ord("+"))
X		    }
X		    "?"    : {
X			tmp := tab(many('*?+')) | &null
X			if upto('*+',\tmp)
X			then put(token_list,-ord("*"))
X			else put(token_list,-ord("?"))
X		    }
X		    "("    : {
X			tab(many('*+?'))
X			put(token_list,-ord("("))
X		    }
X		    default: {
X			put(token_list,-ord(chr))
X		    }
X		}
X	    }
X	    else {
X		case chr of {
X		    # More egrep compatibility stuff.
X		    "["    : {
X			b_loc := find("[") | *&subject+1
X			every next_one := find("]",,,b_loc)
X			\next_one ~= &pos | err_out(s,2,chr)
X			put(token_list,-ord(chr))
X		    }
X                    "]"    : {
X			if &pos = (\next_one+1)
X			then put(token_list,-ord(chr)) &
X			     next_one := &null
X			else put(token_list,ord(chr))
X		    }
X		    default: put(token_list,ord(chr))
X		}
X	    }
X	}
X    }
X
X    token_list := UnMetaBrackets(token_list)
X
X    fixed_length_token_list := list(*token_list)
X    every i := 1 to *token_list
X    do fixed_length_token_list[i] := token_list[i]
X    return fixed_length_token_list
X
Xend
X
X
X
Xprocedure UnMetaBrackets(l)
X
X    # Since brackets delineate a cset, it doesn't make
X    # any sense to have metacharacters inside of them.
X    # UnMetaBrackets makes sure there are no metacharac-
X    # ters inside of the braces.
X
X    local tmplst, i, Lb, Rb
X
X    tmplst := list(); i := 0
X    Lb := -ord("[")
X    Rb := -ord("]")
X
X    while (i +:= 1) <= *l do {
X	if l[i] = Lb then {
X	    put(tmplst,l[i])
X	    until l[i +:= 1] = Rb
X	    do put(tmplst,abs(l[i]))
X	    put(tmplst,l[i])
X	}
X	else put(tmplst,l[i])
X    }
X    return tmplst
X
Xend
X
X
X
Xprocedure MakeFSTN(l,INI,FIN)
X
X    # MakeFSTN recursively descends through the tree structure
X    # implied by the tokenized string, l, recording in (global)
X    # fstn_table a list of operations to be performed, and the
X    # initial and final states which apply to them.
X
X    local i, inter, inter2, tmp, Op, Arg
X    static Lp, Rp, Sl, Lb, Rb, Caret_inside, Dot, Dollar, Caret_outside
X    # global biggest_nonmeta_str, slash_present, parends_present
X    initial {
X	Lp := -ord("("); Rp := -ord(")")
X	Sl := -ord("|")
X	Lb := -ord("["); Rb := -ord("]"); Caret_inside := ord("^")
X	Dot := -ord("."); Dollar := -ord("$"); Caret_outside := -ord("^")
X    }
X
X    /INI := 1 & state_table := table() &
X    NextState("new") & biggest_nonmeta_str := ""
X    /FIN := 0
X
X    # I haven't bothered to test for empty lists everywhere.
X    if *l = 0 then {
X	/state_table[INI] := []
X	put(state_table[INI],o_a_s(zSucceed,&null,FIN))
X	return
X    }
X
X    # HUNT DOWN THE SLASH (ALTERNATION OPERATOR)
X    every i := 1 to *l do {
X	if l[i] = Sl & tab_bal(l,Lp,Rp) = i then {
X	    if i = 1 then err_out(l,2,char(abs(l[i]))) else {
X		/slash_present := "yes"
X		inter := NextState()
X		inter2:= NextState()
X		MakeFSTN(l[1:i],inter2,FIN)
X		MakeFSTN(l[i+1:0],inter,FIN)
X		/state_table[INI] := []
X		put(state_table[INI],o_a_s(apply_FSTN,inter2,0))
X		put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		return
X	    }
X	}
X    }
X
X    # HUNT DOWN PARENTHESES
X    if l[1] = Lp then {
X	i := tab_bal(l,Lp,Rp) | err_out(l,2,"(")
X	inter := NextState()
X	if any('*+?',char(abs(0 > l[i+1]))) then {
X	    case l[i+1] of {
X		-ord("*")   : {
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		    MakeFSTN(l[2:i],INI,INI)
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X		-ord("+")   : {
X		    inter2 := NextState()
X		    /state_table[inter2] := []
X		    MakeFSTN(l[2:i],INI,inter2)
X		    put(state_table[inter2],o_a_s(apply_FSTN,inter,0))
X		    MakeFSTN(l[2:i],inter2,inter2)
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X		-ord("?")   : {
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		    MakeFSTN(l[2:i],INI,inter)
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X	    }
X	}
X	else {
X	    MakeFSTN(l[2:i],INI,inter)
X	    MakeFSTN(l[i+1:0],inter,FIN)
X	    return
X	}
X    }
X    else {     # I.E. l[1] NOT = Lp (left parenthesis as -ord("("))
X	every i := 1 to *l do {
X	    case l[i] of {
X		Lp     : {
X		    inter := NextState()
X		    MakeFSTN(l[1:i],INI,inter)
X		    /parends_present := "yes"
X		    MakeFSTN(l[i:0],inter,FIN)
X		    return
X		}
X		Rp     : err_out(l,2,")")
X	    }
X	}
X    }
X
X    # NOW, HUNT DOWN BRACKETS
X    if l[1] = Lb then {
X	i := tab_bal(l,Lb,Rb) | err_out(l,2,"[")
X	inter := NextState()
X	tmp := ""; every tmp ||:= char(l[2 to i-1])
X	if Caret_inside = l[2]
X	then tmp := ~cset(Expand(tmp[2:0]))
X	else tmp :=  cset(Expand(tmp))
X	if any('*+?',char(abs(0 > l[i+1]))) then {
X	    case l[i+1] of {
X		-ord("*")   : {
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		    put(state_table[INI],o_a_s(any,tmp,INI))
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X		-ord("+")   : {
X		    inter2 := NextState()
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(any,tmp,inter2))
X		    /state_table[inter2] := []
X		    put(state_table[inter2],o_a_s(apply_FSTN,inter,0))
X		    put(state_table[inter2],o_a_s(any,tmp,inter2))
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X		-ord("?")   : {
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		    put(state_table[INI],o_a_s(any,tmp,inter))
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X	    }
X	}
X	else {
X	    /state_table[INI] := []
X	    put(state_table[INI],o_a_s(any,tmp,inter))
X	    MakeFSTN(l[i+1:0],inter,FIN)
X	    return
X	}
X    }
X    else {           # I.E. l[1] not = Lb
X	every i := 1 to *l do {
X	    case l[i] of {
X		Lb     : {
X		    inter := NextState()
X		    MakeFSTN(l[1:i],INI,inter)
X		    MakeFSTN(l[i:0],inter,FIN)
X		    return
X		}
X		Rb     : err_out(l,2,"]")
X	    }
X	}
X    }
X
X    # FIND INITIAL SEQUENCES OF POSITIVE INTEGERS, CONCATENATE THEM
X    if i := match_positive_ints(l) then {
X	inter := NextState()
X	tmp := Ints2String(l[1:i])
X	# if a slash has been encountered already, forget optimizing
X        # in this way; if parends are present, too, then forget it,
X        # unless we are at the beginning or end of the input string
X	if  INI = 1 | FIN = 2 | /parends_present &
X	    /slash_present & *tmp > *biggest_nonmeta_str
X	then biggest_nonmeta_str := tmp
X	/state_table[INI] := []
X	put(state_table[INI],o_a_s(match,tmp,inter))
X	MakeFSTN(l[i:0],inter,FIN)
X	return
X    }
X
X    # OKAY, CLEAN UP ALL THE JUNK THAT'S LEFT
X    i := 0
X    while (i +:= 1) <= *l do {
X	case l[i] of {
X	    Dot          : { Op := any;   Arg := &cset }
X	    Dollar       : { Op := pos;   Arg := 0     }
X	    Caret_outside: { Op := pos;   Arg := 1     }
X	    default      : { Op := match; Arg := char(0 < l[i]) }
X	} | err_out(l,2,char(abs(l[i])))
X	inter := NextState()
X	if any('*+?',char(abs(0 > l[i+1]))) then {
X	    case l[i+1] of {
X		-ord("*")   : {
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		    put(state_table[INI],o_a_s(Op,Arg,INI))
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X		-ord("+")   : {
X		    inter2 := NextState()
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(Op,Arg,inter2))
X		    /state_table[inter2] := []
X		    put(state_table[inter2],o_a_s(apply_FSTN,inter,0))
X		    put(state_table[inter2],o_a_s(Op,Arg,inter2))
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X		-ord("?")   : {
X		    /state_table[INI] := []
X		    put(state_table[INI],o_a_s(apply_FSTN,inter,0))
X		    put(state_table[INI],o_a_s(Op,Arg,inter))
X		    MakeFSTN(l[i+2:0],inter,FIN)
X		    return
X		}
X	    }
X	}
X	else {
X	    /state_table[INI] := []
X	    put(state_table[INI],o_a_s(Op,Arg,inter))
X	    MakeFSTN(l[i+1:0],inter,FIN)
X	    return
X	}
X    }
X
X    # WE SHOULD NOW BE DONE INSERTING EVERYTHING INTO state_table
X    # IF WE GET TO HERE, WE'VE PARSED INCORRECTLY!
X    err_out(l,4)
X
Xend
X
X
X
Xprocedure NextState(new)
X    static nextstate
X    if \new then nextstate := 1
X    else nextstate +:= 1
X    return nextstate
Xend
X
X
X
Xprocedure err_out(x,i,elem)
X    writes(&errout,"Error number ",i," parsing ",image(x)," at ")
X    if \elem 
X    then write(&errout,image(elem),".")
X    else write(&errout,"(?).")
X    exit(i)
Xend
X
X
X
Xprocedure zSucceed()
X    return .&pos
Xend
X
X
X
Xprocedure Expand(s)
X
X    local s2, c1, c2
X
X    s2 := ""
X    s ? {
X	s2 ||:= ="^"
X	s2 ||:= ="-"
X	while s2 ||:= tab(find("-")-1) do {
X	    if (c1 := move(1), ="-",
X		c2 := move(1),
X		c1 << c2)
X	    then every s2 ||:= char(ord(c1) to ord(c2))
X	    else s2 ||:= 1(move(2), not(pos(0))) | err_out(s,2,"-")
X	}
X	s2 ||:= tab(0)
X    }
X    return s2
X
Xend
X
X
X
Xprocedure tab_bal(l,i1,i2)
X
X    local i, i1_count, i2_count
X
X    i := 0
X    i1_count := 0; i2_count := 0
X    while (i +:= 1) <= *l do {
X	case l[i] of {
X	    i1  : i1_count +:= 1
X	    i2  : i2_count +:= 1
X	}
X	if i1_count = i2_count
X	then suspend i
X    }
X
Xend
X
X
Xprocedure match_positive_ints(l)
X    
X    # Matches the longest sequence of positive integers in l,
X    # beginning at l[1], which neither contains, nor is fol-
X    # lowed by a negative integer.  Returns the first position
X    # after the match.  Hence, given [55, 55, 55, -42, 55],
X    # match_positive_ints will return 3.  [55, -42] will cause
X    # it to fail rather than return 1 (NOTE WELL!).
X
X    local i
X
X    every i := 1 to *l do {
X	if l[i] < 0
X	then return (3 < i) - 1 | fail
X    }
X    return *l + 1
X
Xend
X
X
Xprocedure Ints2String(l)
X
X    local tmp
X
X    tmp := ""
X    every tmp ||:= char(!l)
X    return tmp
X
Xend
X
X
Xprocedure StripChar(s,s2)
X
X    local tmp
X
X    if find(s2,s) then {
X	tmp := ""
X	s ? {
X	    while tmp ||:= tab(find("s2"))
X	    do tab(many(cset(s2)))
X	    tmp ||:= tab(0)
X	}
X    }
X    return \tmp | s
X
Xend
SHAR_EOF
true || echo 'restore of findre.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= huffcode.icn ==============
if test -f 'huffcode.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping huffcode.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting huffcode.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'huffcode.icn' &&
X############################################################################
X#
X#	Name:	 huffcode.icn
X#
X#	Title:	 huffman coding tools
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.4
X#
X############################################################################
X#  
X#  An odd assortment of tools that lets me compress text using an
X#  Iconish version of a generic Huffman algorithm.  See block_encode().
X#
X############################################################################
X#
X#  Links: outbits.icn inbits.icn
X#
X#  See also: press.icn
X#
X############################################################################
X
Xrecord node(l,r,n)
Xrecord _N(l,r)
Xrecord leaf(c,n)
Xrecord hcode(c,i,len)
X
X# For debugging purposes.
X# link ximage
X
Xprocedure count_chars(s, char_tbl)
X
X    #
X    # Count chars in s, placing stats in char_tbl (keys = chars in
X    # s, values = leaf records, with the counts for each chr in s
X    # contained in char_tbl[chr].n).
X    #
X    local chr
X    initial {
X	/char_tbl & stop("count_chars:  need 2 args - 1 string, 2 table")
X	*char_tbl ~= 0 & stop("count_chars:  start me with an empty table!")
X    }
X
X    s ? {
X	while chr := move(1) do {
X	    /char_tbl[chr]   := leaf(chr,0)
X	    char_tbl[chr].n +:= 1
X	}
X    }
X
X#    write(ximage(char_tbl))
X    return *char_tbl		# for lack of anything better
X
Xend
X
X
Xprocedure heap_init(char_tbl)
X
X    #
X    # Create heap data structure out of the table filled out by
X    # successive calls to count_chars(s,t).  The heap is just a
X    # list.  Naturally, it's size can be obtained via *heap.
X    #
X    local heap
X
X    heap := list()
X    every push(heap, !char_tbl) do {
X	resettle_heap(heap, 1)
X#	write(ximage(heap))
X    }
X
X    return heap
X
Xend
X
X
Xprocedure resettle_heap(h, k)
X
X    #
X    # Based loosely on Sedgewick (2nd. ed., 1988), p. 160.  Take k-th
X    # node on the heap, and walk down the heap, switching this node
X    # along the way with the child whose value is the least AND whose
X    # value is less than this node's.  Stop when you find no children
X    # whose value is less than that of the original node.  Elements on
X    # heap are records of type leaf, with the values contained in the
X    # "n" field.
X    #
X    local j
X
X    # While we haven't spilled off the end of the heap (the size of the
X    # heap is *h; *h / 2 is the biggest k we need to look at)...
X    while k <= (*h / 2) do {
X
X	# ...double k, assign the result to j.
X	j := k+k
X
X	# If we aren't at the end of the heap...
X	if j < *h then {
X	    # ...check to see which of h[k]'s children is the smallest,
X	    # and make j point to it.
X	    if h[j].n > h[j+1].n then
X		# h[j] :=: h[j+1]
X		j +:= 1
X	}
X
X	# If the current parent (h[k]) has a value less than those of its
X	# children, then break; we're done.
X	if h[k].n <= h[j].n then break
X
X	# Otherwise, switch the parent for the child, and loop around
X        # again, with k (the pointer to the parent) now pointing to the
X	# new offset of the element we have been working on.
X	h[k] :=: h[j]
X	k := j
X
X    }
X
X    return k
X	
Xend
X
X
Xprocedure heap_2_huffman_tree(h)
X
X    #
X    # Construct the Huffman tree out of heap h.  Find the smallest
X    # element, pop it off the heap, then reshuffle the heap.  After
X    # reshuffling, replace the top record on the stack with a node()
X    # record whose n field equal to the sum of the n fields for the
X    # element popped off the stack originally, and the one that is
X    # now about to be replaced.  Link the new node record to the 2
X    # elements on the heap it is now replacing.  Reshuffle the heap
X    # again, then repeat.  You're done when the size of the heap is
SHAR_EOF
true || echo 'restore of huffcode.icn failed'
fi
echo 'End of  part 3'
echo 'File huffcode.icn is continued in part 4'
echo 4 > _shar_seq_.tmp
exit 0

exit 0 # Just in case...
-- 
Kent Landfield                   INTERNET: kent@sparky.IMD.Sterling.COM
Sterling Software, IMD           UUCP:     uunet!sparky!kent
Phone:    (402) 291-8300         FAX:      (402) 291-4362
Please send comp.sources.misc-related mail to kent@uunet.uu.net.
