/***************************************************************
 * File: msrch.c
 * Author: John Rex
 * Purpose: search text for multiple keywords simultaneously
 * Compilers: Turbo C 2.0
 *            Microsoft C 5.1
 * Memory model: any
 *
 * Switches: TEST - if == 1, a test driver is compiled
 *			 MAXCHAR - max number of different symbols recognized
 *
 * Usage: The sample driver illustrates all of the key points.  In brief,
 *        there are three routines:
 *			void msrch_init(struct kword *);
 *			void msrch_go(int (*msrch_data) (), void (*msrch_signal) (char *));
 *			void msrch_end(void);
 *
 *		  (1) Pass msrch_init() a list of words to search for 
 *        (2) msrch_go() does the work.  It uses pointers to two functions--
 *             the first retrieves a character and the second is called when
 *             a match has been found 
 *        (3) msrch_end() cleans up the work areas
 *
 * Technique: A finite state string pattern matching machine that
 *            recognizes the desired word(s) is built and then used
 *            to scan the text.  See text for detailed discussion.
 *
 * Reference: Aho AV, Corasick MJ: Efficient string matching: An aid to
 *            bibliographic search.  Comm ACM 1975; 18:333-340.
 *
 **************************************************************/

/* TELLME(x): debug-trace wrapper.  When DEBUG is defined the argument is
   compiled in exactly as written; otherwise it expands to nothing, so the
   wrapped statement disappears from the build. */
#ifdef DEBUG
#define TELLME(x) x
#else
#define TELLME(x)
#endif

#define TEST 0              /* set to 1 to compile the test driver guarded by "#if TEST == 1" */
#pragma page
/* local declarations */

#include "msrch.h"

#define MAXCHAR	 256		/* number of distinct input symbols; also the length
							   of every MULTI_WAY branch table */
static int maxstate;		/* number of entries allocated in each per-state
							   array; computed by msrch_init() */

/*******************************************************
 * match_array[] and goto_array[] are the Goto function
 *
 * Both are indexed by state number: match_array[s] says
 * how state s branches, goto_array[s] says where to.
 *******************************************************/
static int *match_array;	/* first level of matching.  Possible values:
								(1) EMPTY_SLOT - no transition recorded yet
								(2) a character - exactly one transition,
								    taken on that character
								(3) MULTI_WAY - transitions live in a
								    branch table */
#define MULTI_WAY	-1		/* flag for match_array */
#define EMPTY_SLOT	-2		/* flag for match_array */

union goto_table {			/* values in match_array take us here */
	int goto_state;			/* next state; valid when match_array[s] is a
							   character */
	int *branch_table;		/* MAXCHAR-entry table of next states indexed by
							   character; valid when match_array[s] is
							   MULTI_WAY */
} static *goto_array;

#define FAIL_STATE	-1		/* in goto_state or branch_table,
							    this means failure (no transition) */

/*******************************************************
 * out_array[] is the Output function
 *******************************************************/
static struct kword **out_array;	/* per state: linked list of keywords that
									   msrch_go() reports on entering that
									   state, or NULL if there are none */

/*******************************************************
 * fail_array[] is the Fail function
 *******************************************************/
static int *fail_array;		/* per state: the state to retry from when the
							   current state has no transition for the
							   input character */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#pragma page
/*****************************************************************
 * msrch_init() and subroutines
 * Purpose: setup tables needed by msrch_go()
 ****************************************************************/

static int highstate;		/* highest state number handed out so far; reset to
							   0 by msrch_init() and incremented by enter()
							   just before each new state is created */

/* helpers used by msrch_init(); all are defined later in this file */
static void enter(unsigned char *), add_state_trans(int, int, int);
static void compute_fail(void);

/*
 * msrch_init: build the state machine that recognizes every keyword in klist.
 *
 * klist - singly linked list of keywords, walked through ->next until NULL.
 *
 * On return the Goto tables (match_array/goto_array), the Output lists
 * (out_array) and the Fail table (fail_array) are ready for msrch_go().
 * If the work areas cannot be allocated a message is written to stderr and
 * the program exits with status 1, the same way enter() reports its
 * internal error.
 */
void msrch_init(struct kword *klist)
{
	int i;
	struct kword *ktemp;

	/* Upper bound on states: the root plus one state per keyword character
	   (reached when no two keywords share a prefix). */
	maxstate = 1;
	for (ktemp = klist; ktemp != NULL; ktemp = ktemp->next)
		maxstate += (int) strlen((const char *) ktemp->word);

	/* allocate space for the per-state arrays */
	match_array = malloc(sizeof *match_array * (size_t) maxstate);
	goto_array = malloc(sizeof *goto_array * (size_t) maxstate);
	out_array = malloc(sizeof *out_array * (size_t) maxstate);
	fail_array = malloc(sizeof *fail_array * (size_t) maxstate);
	if (match_array == NULL || goto_array == NULL ||
	    out_array == NULL || fail_array == NULL) {
		fputs("msrch.c: out of memory\n", stderr);
		exit(1);
	}

	/* initialize state arrays; fail_array gets a defined value everywhere
	   (the root's entry is never overwritten by compute_fail()) */
	for (i = 0; i < maxstate; i++) {
		match_array[i] = EMPTY_SLOT;
		out_array[i] = NULL;
		fail_array[i] = 0;
	}

	/* Initialize state 0.  Two dummy transitions are added so that the
	   second one converts the root to a MULTI_WAY table; msrch_go() and
	   the loop below rely on the root always having a branch table. */
	highstate = 0;
	add_state_trans(0, 'a', FAIL_STATE);
	add_state_trans(0, 'b', FAIL_STATE);

	/* step through the keywords, adding each to the machine */
	for (; klist != NULL; klist = klist->next)
		enter(klist->word);

	/* any character that starts no keyword loops back to state 0 */
	for (i = 0; i < MAXCHAR; i++)
		if (goto_array[0].branch_table[i] == FAIL_STATE)
			goto_array[0].branch_table[i] = 0;

	/* and compute the failure array */
	compute_fail();
}
#pragma page
/*
 * add_state_trans: record the transition oldstate --matchchar--> newstate.
 *
 * oldstate  - state the transition leaves
 * matchchar - character that triggers it; used directly as an index into
 *             the MAXCHAR-entry branch table, so it must be 0..MAXCHAR-1
 * newstate  - state the transition enters (or FAIL_STATE)
 *
 * A state with a single transition stores it inline; adding a second one
 * converts the state to a heap-allocated MULTI_WAY branch table.  If that
 * table cannot be allocated a message is written to stderr and the program
 * exits with status 1.
 */
static void add_state_trans(int oldstate, int matchchar, int newstate)
{
	int i, *temp;

	/* is this slot empty? */
	if (match_array[oldstate] == EMPTY_SLOT) { /* store the single transition inline */
		match_array[oldstate] = matchchar;
		goto_array[oldstate].goto_state = newstate;
	}

	/* is there already a multi-way table? */
	else if (match_array[oldstate] == MULTI_WAY)
		goto_array[oldstate].branch_table[matchchar] = newstate;

	/* need to convert from single transition to multi-way table */
	else {
		temp = malloc(sizeof *temp * MAXCHAR);
		if (temp == NULL) {
			fputs("msrch.c: out of memory\n", stderr);
			exit(1);
		}
		for (i = 0; i < MAXCHAR; i++)
			temp[i] = FAIL_STATE;

		/* carry over the existing single-way transition */
		temp[match_array[oldstate]] = goto_array[oldstate].goto_state;

		/* and add the new one */
		temp[matchchar] = newstate;

		/* switch the state over to the table */
		match_array[oldstate] = MULTI_WAY;
		goto_array[oldstate].branch_table = temp;
	}
}
#pragma page
/*
 * enter: add one keyword to the list of words the machine recognizes.
 *
 * kword - NUL-terminated keyword.  The string is not copied: the node added
 *         to the output list points at the caller's storage.
 *
 * The keyword is first laid over any existing path that shares its prefix;
 * new states are then created for the remaining characters.  The final
 * state gets a freshly allocated node pushed onto the front of its output
 * list.  Running out of states or memory writes a message to stderr and
 * exits with status 1.
 */
static void enter(unsigned char *kword)
{
	int state, k;
	char *save;
	struct kword *ktemp;

	state = 0;
	save = (char *) kword;	/* keep the start of the word for the output list */

	/* first, follow existing transitions as far as they match this word */
	for (; *kword != '\0'; kword++) {
		/* single char slot that matches? */
		if (match_array[state] == *kword)
			state = goto_array[state].goto_state;

		/* multi-way table? */
		else if (match_array[state] == MULTI_WAY) {
			k = goto_array[state].branch_table[*kword];
			if (k == FAIL_STATE)
				break;		/* no transition for this char */
			state = k;
		}

		/* empty slot, or a single slot for a different char */
		else
			break;
	}

	/* now add new states for whatever is left of the word */
	for (; *kword != '\0'; kword++) {
		highstate += 1;
		if (highstate >= maxstate) { /* msrch_init() sized the arrays too small */
			fputs("msrch.c: INTERNAL ERROR - too many states\n", stderr);
			exit(1);
		}
		add_state_trans(state, *kword, highstate);
		state = highstate;
	}

	/* now add this keyword to the output list of the final state */
	ktemp = malloc(sizeof *ktemp);
	if (ktemp == NULL) {
		fputs("msrch.c: out of memory\n", stderr);
		exit(1);
	}
	ktemp->word = save;
	ktemp->next = out_array[state];
	out_array[state] = ktemp;
}
#pragma page
/* helpers used by compute_fail(); both are defined later in this file */
static void queue_add(int *queue, int qbeg, int new);
static void find_fail(int state, int s, int a);

/*
 * compute_fail: build fail_array and extend out_array.
 *
 * States are visited level by level starting from the root's children.
 * The work queue is a linked list threaded through an int array:
 * queue[x] is the state that follows x, 0 ends the list, and qbeg is the
 * state most recently taken off the front (initially the root, 0).
 * For each visited state r, every child s reached on character a is queued
 * and find_fail(fail_array[r], s, a) fills in fail_array[s] and merges the
 * output lists.
 *
 * If the queue cannot be allocated a message is written to stderr and the
 * program exits with status 1.
 */
static void compute_fail(void)
{
	int *queue, qbeg, r, s;
	int i;

	/* allocate a queue with one link slot per possible state */
	queue = malloc(sizeof *queue * (size_t) maxstate);
	if (queue == NULL) {
		fputs("msrch.c: out of memory\n", stderr);
		exit(1);
	}
	qbeg = 0;
	queue[0] = 0;

	/* scan first level: every child of the root fails back to the root */
	for (i = 0; i < MAXCHAR; i++)
		if ((s = goto_array[0].branch_table[i]) != 0) {
			fail_array[s] = 0;
			queue_add(queue, qbeg, s);
		}

	/* now scan lower levels */
	while (queue[qbeg] != 0) {
		r = queue[qbeg];	/* pull off state from front of queue */
		qbeg = r;			/* and advance qbeg */

		TELLME(printf("Now investigating state %d\n",r);)
		/* now investigate this state */
		if (match_array[r] == EMPTY_SLOT)
			continue;		/* no children, nothing more to do */
		else if (match_array[r] == MULTI_WAY) {
			/* scan branch_table for children */
			for (i = 0; i < MAXCHAR; i++)
				if ((s = goto_array[r].branch_table[i]) != FAIL_STATE) {
					queue_add(queue, qbeg, s);	/* add new state to queue */
					find_fail(fail_array[r], s, i);
				}
		}
		else { /* single char: exactly one child */
			queue_add(queue, qbeg, goto_array[r].goto_state);
			find_fail(fail_array[r], goto_array[r].goto_state, match_array[r]);
		}
	}

	/* tidy up */
	free(queue);
}
#pragma page
/*
 * find_fail: compute the failure transition for state s2.
 *
 * s1 - state to start backtracking from (the caller passes the failure
 *      state of s2's parent)
 * s2 - state whose fail_array entry is being computed
 * a  - character on which s2 is entered from its parent
 *
 * Starting at s1, the fail_array chain is followed until a state with a
 * transition on 'a' is found; the target of that transition becomes
 * fail_array[s2].  A copy of that target's output list is then appended to
 * s2's own list, so every state owns all the nodes on its list.
 *
 * If a list node cannot be allocated a message is written to stderr and
 * the program exits with status 1.
 */
static void find_fail(int s1, int s2, int a)
{
	int on_fail;
	struct kword *ktemp, *out_copy, *kscan;

	TELLME(printf("find_fail: invoked with (%d, %d, %c)\n", s1,s2,a);)
	/* the loop can only be left through a break, after on_fail is set */
	for (;; s1 = fail_array[s1])
		if (match_array[s1] == a) {
			if ((on_fail = goto_array[s1].goto_state) != FAIL_STATE)
				break;
		}
		else if (match_array[s1] == MULTI_WAY) {
			if ((on_fail = goto_array[s1].branch_table[a]) != FAIL_STATE)
				break;
		}

	TELLME(printf("find_fail: on_fail is %d\n", on_fail);)
	fail_array[s2] = on_fail;

	/* merge output lists */

	/* first, make a copy of out_array[on_fail]: the first node stays at the
	   head and each later node is inserted directly behind it */
	if (out_array[on_fail] == NULL)
		out_copy = NULL;
	else {
		kscan = out_array[on_fail];
		out_copy = malloc(sizeof *out_copy);
		if (out_copy == NULL) {
			fputs("msrch.c: out of memory\n", stderr);
			exit(1);
		}
		out_copy->word = kscan->word;
		out_copy->next = NULL;
		for (kscan = kscan->next; kscan != NULL; kscan = kscan->next) {
			ktemp = malloc(sizeof *ktemp);
			if (ktemp == NULL) {
				fputs("msrch.c: out of memory\n", stderr);
				exit(1);
			}
			ktemp->word = kscan->word;
			ktemp->next = out_copy->next;
			out_copy->next = ktemp;
		}
	}

	/* now hang the copy off the end of s2's own list */
	if (out_array[s2] == NULL)
		out_array[s2] = out_copy;
	else {
		for (ktemp = out_array[s2]; ktemp->next != NULL; ktemp = ktemp->next)
			;
		ktemp->next = out_copy;
	}
}

/*
 * queue_add: append state 'new' to the linked queue.
 *
 * queue - link array: queue[x] is the state following x, 0 ends the chain
 * qbeg  - state the chain currently starts from
 * new   - state to append
 */
static void queue_add(int *queue, int qbeg, int new)
{
	int tail = qbeg;

	/* walk from the front until we stand on the last link of the chain */
	while (queue[tail] != 0)
		tail = queue[tail];

	queue[tail] = new;	/* hook the new state on */
	queue[new] = 0;		/* and make it the end of the chain */
}
#pragma page
/*****************************************************************
 * msrch_go() 
 * Purpose: do the actual search
 ****************************************************************/

void msrch_go(int (*msrch_data) (), void (*msrch_signal) (char *))
/* search routine */
{
	int state, c, g, m;
	struct kword *kscan;

	state = 0;
	while ((c = msrch_data()) != EOF) {
		/* what is goto(state, c)? */
		for(;;) {
			/* we cheat slightly in the interest of speed/simplicity.  The
			   machine will spend most of its time it state==0, and this state
			   is always a MULTI_WAY table.  Since this is a simple test, we 
			   make it first and try to save the calculation of an array index */

			if (state == 0 || (m = match_array[state]) == MULTI_WAY)
				g = goto_array[state].branch_table[c];
			else if (m == c)
				g = goto_array[state].goto_state;
			else
				g = FAIL_STATE;

			if (g != FAIL_STATE)
				break;
			state = fail_array[state];
		}
		state = g;

		/* anything to output? */
		if ((kscan = out_array[state]) != NULL)
			for(;kscan != NULL; kscan = kscan->next)
				msrch_signal(kscan->word);
	}
}
#pragma page
/*****************************************************************
 * msrch_end()
 * Purpose: clean up when done
 ****************************************************************/

/*
 * msrch_end: free every work area created by msrch_init().
 *
 * Releases the MULTI_WAY branch tables, the output-list nodes, and the four
 * per-state arrays.  The file-level pointers are cleared and maxstate is
 * reset afterwards, so calling msrch_end() again without an intervening
 * msrch_init() does nothing.
 */
void msrch_end(void)
{
	int i;
	struct kword *kscan, *knext;

	/* branch tables exist only for states marked MULTI_WAY */
	for (i = 0; i < maxstate; i++)
		if (match_array[i] == MULTI_WAY)
			free(goto_array[i].branch_table);

	/* output lists: fetch the link before the node is freed */
	for (i = 0; i < maxstate; i++)
		for (kscan = out_array[i]; kscan != NULL; kscan = knext) {
			knext = kscan->next;
			free(kscan);
		}

	free(match_array);
	free(goto_array);
	free(fail_array);
	free(out_array);

	match_array = NULL;
	goto_array = NULL;
	fail_array = NULL;
	out_array = NULL;
	maxstate = 0;
}

#pragma page
/*****************************************************************
 * Test driver
 * 
 * The routine expects a command line of the form
 *    msrch file word-1 word-2 word-3 .... word-n
 *
 * It will then search file for all of the words on the command line.
 * The results are written to stdout.  This illustrates all of the
 * features of using the multisearch routines.
 *
 * This is an admittedly simple design--the search routine would
 * certainly be faster if the character fetch routine was put
 * directly into the msrch_go() module.  However, to avoid using
 * application specific code in the demonstration version of these
 * routines, I have coded it as a separate subroutine.
 ****************************************************************/

#if TEST == 1
#define BUFSIZE 200		/* size of the line buffer handed to fgets() */

FILE *infile;			/* file being searched; opened in main(), closed by
						   retrieve_char() when fgets() reports end of input */
char inbuf[BUFSIZE];	/* text most recently read by fgets() */
char *inbufptr;			/* points at the character retrieve_char() last
						   returned; NULL until the first read */
int linecount;			/* number of successful fgets() calls so far */

/* declare the routines that msrch_go() will use */
int retrieve_char(void);
void found_word();

/*
 * main: test driver.
 *
 * Usage: msrch infile word-1 word-2 ... word-n
 *
 * Opens argv[1], builds a keyword list from the remaining arguments, and
 * runs the three msrch routines over the file.  Exits with status 1 on a
 * usage error, if the file cannot be opened, or if memory runs out;
 * returns 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct kword *khead, *ktemp;
	int i;

	if (argc < 3) {
		fprintf(stderr, "Usage: msrch infile word-1 word-2 ... word-n\n");
		exit(1);
	}

	/* use argv[1] directly; no fixed-size copy that a long path could overrun */
	if ((infile = fopen(argv[1], "r")) == NULL) {
		fprintf(stderr, "Can't open %s\n", argv[1]);
		exit(1);
	}
	linecount = 0;
	inbufptr = NULL;

	/* amalgamate command line params into a list of words; each word is
	   pushed on the front, so the list is in reverse command-line order */
	khead = NULL;
	for (i = 2; i < argc; i++) {
		ktemp = malloc(sizeof *ktemp);
		if (ktemp == NULL) {
			fprintf(stderr, "msrch: out of memory\n");
			exit(1);
		}
		ktemp->word = argv[i];
		ktemp->next = khead;
		khead = ktemp;
	}

	msrch_init(khead);	/* setup system - pass list of words */
	msrch_go(retrieve_char, found_word); /* actually search.  note technique of
											passing pointers to functions */
	msrch_end();		/* clean up */

	/* msrch_end() frees only the nodes msrch made itself; release ours */
	while (khead != NULL) {
		ktemp = khead->next;
		free(khead);
		khead = ktemp;
	}

	return 0;
}


/* retrieve_char: character source handed to msrch_go().
   Steps through inbuf one character at a time and refills it from infile
   with fgets() whenever the terminating NUL is reached (or nothing has been
   read yet).  Returns either
     (a) the next character as an int with the high bits masked off, so it
         is never sign extended, or
     (b) EOF once fgets() fails, after closing infile */
int retrieve_char()
{
	int need_line;

	if (inbufptr == NULL)
		need_line = 1;		/* nothing read yet */
	else {
		inbufptr++;
		need_line = (*inbufptr == '\0');	/* ran off the end of the buffer */
	}

	if (need_line) {
		if (fgets(inbuf, BUFSIZE, infile) == NULL) {
			fclose(infile);
			return EOF;
		}
		linecount += 1;
		inbufptr = inbuf;
	}

	return *inbufptr & 0x00FF;
}

/* found_word: match callback handed to msrch_go().
   Prints the current line count and buffer contents, then the matched word
   on the following line, padded with blanks according to the current read
   position minus the word's length. */
void found_word(char *word)
{
	int pad;

	printf("Line %d\n%s", linecount, inbuf);

	/* inbufptr sits on the last character of the match */
	pad = (inbufptr-inbuf) - strlen(word) + 1;
	while (pad > 0) {
		fputc(' ', stdout);
		pad--;
	}

	printf("%s\n\n", word);
}
#endif
