#!/bin/sh
#
# search (previously httpd_wais)				Version 1.0
#
#	/htbin script doing WAIS inverted index searches
#	and producing results in HTML.
#	Creates a WAIS inverted index of HTML document
#	tree if the index doesn't already exist.
#
#	This program is placed in the server's htbin directory
#	(as given with htbin rule in the rule file).
#
# PARAMETERS:
#	$1 = name of the .html file from which request was given
#	$2, $3, ... = keywords (whitespace as separators)
#
# OUTPUT TO STDOUT, FORMAT:
#
#	Content-Type: text/html
#
#	<HEAD>...
#	
#
# NEEDS THE FOLLOWING PROGRAMS IN $PROGDIR:
#	waissearch, waisindex, extract_title.
#
# ORIGINAL HGREP-BASED IMPLEMENTATION BY:
#	Christian Neuss	
#
# REWRITTEN BY:
#	Ari Luotonen, CERN, 24 Nov 1993, luotonen@dxcern.cern.ch
#
# !!! SET THE FOLLOWING VARIABLES TO CORRECT VALUES !!!
#

# Directory where executables reside
PROGDIR='/apps/WWW/bin'

# Home of WWW document tree
WWW_HOME='/CERN_WWW/CERNWeb'

# Home of inverted index tree
INDEX_HOME='/CERN_WWW/Index'

# Placeholder string that is replaced by the keywords
# in the index.* files.
KEYWORD_ESCAPE='index_keywords'

#
# !!! YOU SHOULDN'T NEED TO CHANGE ANYTHING AFTER THIS LINE !!!
#

# Binaries -- full paths of the helper programs, all taken from $PROGDIR

EXTRACT_TITLE="${PROGDIR}/extract_title"	# Extracts title of HTML file
WAISINDEX="${PROGDIR}/waisindex"
WAISSEARCH="${PROGDIR}/waissearch"


# Directories

# request_dir PATH
#	Print the directory part of PATH, i.e. everything before the
#	last "/".  Prints an empty line when PATH contains no "/".
#	The operand is double-quoted and prefixed with "X" so that a
#	path containing whitespace, or one that looks like an expr
#	operator, cannot break the expression.

request_dir() {
	expr "X$1" : 'X\(.*\)/.*'
}

WWW_DIR=$(request_dir "$1")	# Directory of the requesting file.
INDEX_DIR=$INDEX_HOME/$WWW_DIR	# Corresponding index directory.


# WWW_HOME-relative file name of the <ISINDEX> file
# (used only in response to say what file the query was entered from).

# relative_to_www_home PATH
#	Print the part of PATH that follows a leading $WWW_HOME, or an
#	empty line when PATH does not start with $WWW_HOME.  $WWW_HOME is
#	used as a regular expression by expr.  Both operands are quoted
#	and "X"-prefixed so whitespace or operator-like values in PATH
#	cannot break the expression.

relative_to_www_home() {
	expr "X$1" : "X$WWW_HOME"'\(.*\)'
}

RELATIVE_ISINDEX_FILE=$(relative_to_www_home "$1")


# Files
#
# The index bookkeeping files are located under the index tree; the
# optional HTML template files are looked up in the directory of the
# requesting document.

INDEX_MAP="${INDEX_HOME}/IndexMap"
INDEX_FILE="${INDEX_DIR}/WAIS-index"

NOMATCH_FILE="${WWW_DIR}/index.nomatch"
PRELUDE_FILE="${WWW_DIR}/index.prelude"
POSTLUDE_FILE="${WWW_DIR}/index.postlude"

# Drop the file-name argument; whatever remains is the query, joined
# into a single space-separated string.
shift
KEYWORDS="$*"

# Set to 1 further down if the inverted index has to be built during
# this request.
HAD_TO_CREATE=0

# Write header section: the content type line followed by the empty
# line that terminates the header.
printf '%s\n\n' 'Content-Type: text/html'

# html_escape TEXT
#	Print TEXT with the HTML metacharacters & < > " replaced by
#	entities and embedded newlines folded to spaces, so that
#	user-supplied text can be embedded in the generated page.

html_escape() {
	{ printf '%s' "$1" | tr '\n' ' '; echo; } |
	sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' \
	    -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

# sed_quote_replacement TEXT
#	Print TEXT with \ / and & backslash-escaped so that it can be
#	used literally as the replacement part of a sed s/// command.
#	TEXT must not contain newlines.

sed_quote_replacement() {
	printf '%s\n' "$1" | sed -e 's|[\\/&]|\\&|g'
}

# expand_template FILE
#	Copy FILE to stdout with every occurrence of $KEYWORD_ESCAPE
#	replaced by the HTML-escaped value of $KEYWORDS.

expand_template() {
	TEMPLATE_VALUE=$(html_escape "$KEYWORDS")
	TEMPLATE_VALUE=$(sed_quote_replacement "$TEMPLATE_VALUE")
	sed "s/$KEYWORD_ESCAPE/$TEMPLATE_VALUE/g" "$1"
}

# Produce the HTML body of the response.  Everything runs in a subshell,
# so the cd, "set -f" and positional-parameter changes below do not leak
# out of it.

(
	# Nothing in here needs pathname expansion.  Turning it off keeps
	# the deliberately unquoted (word-split) expansions below from
	# being expanded against the file system.

	set -f

	if test -f "$PRELUDE_FILE"
	then
		expand_template "$PRELUDE_FILE"
	else
		echo '<HEAD><ISINDEX><TITLE>(Query)</TITLE></HEAD>'
		echo '<BODY><H1>Query result</H1><P>'
		printf 'From document <EM>"%s"</EM> your\n' \
			"$(html_escape "$RELATIVE_ISINDEX_FILE")"
		printf 'query <EM>"%s"</EM> produced these results:\n' \
			"$(html_escape "$KEYWORDS")"
	fi

	# Check if we first need to create the inverted index

	if test ! -r "$INDEX_FILE.inv"
	then
		HAD_TO_CREATE=1

		# Create the directory that will hold the index, including
		# any missing parents.

		mkdir -p "$(dirname "$INDEX_FILE")"

		# Inform the automatic index updater about this new index
		# i.e. write an entry to IndexMap file.

		printf '%s\t%s\n' "$INDEX_FILE" "$WWW_DIR" >> "$INDEX_MAP"

		# Create the inverted index.  Skipped when the document
		# directory cannot be entered, so that we never index
		# whatever the current directory happens to be; the check
		# below then reports the failure.

		if cd "$WWW_DIR"
		then
			INDEXED_FILES=$(find . \( -name '*.html' -o -name '*.txt' \) -a ! -name ',*' -print)
			# $INDEXED_FILES is intentionally unquoted: one
			# argument per file (names containing whitespace are
			# not supported).
			"$WAISINDEX" -d "$INDEX_FILE" -nocat -t text $INDEXED_FILES >/dev/null 2>&1
		fi
	fi

	# At this point we should have an existing index file

	if test ! -r "$INDEX_FILE.inv"
	then
		echo '<P><H1>ERROR</H1>'
		echo 'Failed to create an inverted index file on the fly.'
		echo 'There must be something wrong with directory'
		echo 'protections. <EM>Sorry.</EM><P></BODY>'
		exit
	fi

	# At this point we are sure we have an inverted index file
	# Go for it!

	echo "<UL>"

	# $KEYWORDS is intentionally unquoted: each keyword becomes its own
	# argument.
	# NOTE(review): a keyword starting with "-" is still handed to
	# waissearch as-is and may be taken as an option -- confirm whether
	# waissearch offers an end-of-options marker.

	"$WAISSEARCH" -d "$INDEX_FILE" $KEYWORDS </dev/null  |  grep 'Score:' |
	while read -r LINE
	do
		# Split the result line into fields: $3 = score followed by
		# a comma, $5 = line count, $6 = quote + file name,
		# $7 = directory + quote.
		set -- $LINE
		SCORE=$(expr "X$3" : 'X\(.*\),')
		HIT_LINES=$5
		NEW_URL_DIR=$(expr "X$7" : "X$WWW_DIR/*\(.*[^/]\)/*'")
		NEW_URL_FILE=$(expr "X$6" : "X'\(.*\)")
		if test "$NEW_URL_FILE" = "Information"
		then			# Hack to work around strange
			continue	# "Information on database: ..."
		fi			# line returned by waissearch.
		if test "$NEW_URL_DIR"
		then
			NEW_URL=$NEW_URL_DIR/$NEW_URL_FILE
		else
			NEW_URL=$NEW_URL_FILE
		fi
		printf '<LI> %s (%s lines): <A HREF="%s">' \
			"$SCORE" "$HIT_LINES" "$NEW_URL"
		# Word-split the title so that it always ends up on the same
		# output line as the <LI>; the grep below keeps only lines
		# containing "LI".
		set -- $("$EXTRACT_TITLE" "$WWW_DIR" "$NEW_URL")
		printf '%s\n' "$* </A>"
	done | grep LI	# passes the hits through; status 1 means no hits
	EXIT_STATUS=$?

	echo "</UL>"

	if test "$EXIT_STATUS" = 1
	then
		if test -f "$NOMATCH_FILE"
		then
			expand_template "$NOMATCH_FILE"
		else
			echo 'No matching files have been found.<P>'
		fi
	fi

	if test -f "$POSTLUDE_FILE"
	then
		# Substitutes the full keyword string, like the prelude and
		# nomatch templates.
		expand_template "$POSTLUDE_FILE"
	else
		echo "(At most top 40; found by WAIS index on"
		echo "WWW server <EM>$(hostname)</EM>).<P>"

		if test "$HAD_TO_CREATE" = 1
		then
			echo "<EM>Notice:</EM>"
			echo "The inverted index file didn't exist and it"
			echo "had to be created on the fly. Next time the"
			echo "searches will be much quicker.<P>"
		fi
		echo "</BODY>"
	fi
)

