/*
 * Bawk regular expression compiler/interpreter
 */
#include <stdio.h>
#include "bawk.h"
 
re_compile( patbuf )
char	*patbuf;		/* where to put compiled pattern */
{
	/*
	 * Compile a regular expression from current input file
	 * into the given pattern buffer.
	 */
	int	c,		/* Current character         */
		o;		/* Temp                      */
	char	*patptr,	/* destination string pntr   */
		*lp,		/* Last pattern pointer      */
		*spp,		/* Save beginning of pattern */
		delim,		/* pattern delimiter         */
		*cclass();	/* Compile class routine     */
 
	patptr = patbuf;
	delim = getcharacter();

	while ( (c = getcharacter()) != -1 && c != delim )
	{
		/*
		 * STAR, PLUS and MINUS are special.
		 */
		if (c == '*' || c == '+' || c == '-') {
			if (patptr == patbuf ||
				  (o=patptr[-1]) == BOL ||
				  o == EOL ||
				  o == STAR ||
				  o == PLUS ||
				  o == MINUS)
				error( "illegal occurrance op", RE_ERROR );
			*patptr++ = ENDPAT;
			*patptr++ = ENDPAT;
			spp = patptr;		/* Save pattern end     */
			while (--patptr > lp)	/* Move pattern down... */
				*patptr = patptr[-1];	/* one byte     */
			*patptr =   (c == '*') ? STAR :
				(c == '-') ? MINUS : PLUS;
			patptr = spp;		/* Restore pattern end  */
			continue;
		}
		/*
		 * All the rest.
		 */
		lp = patptr;			/* Remember start       */
		switch(c) {
 
		case '^':
			*patptr++ = BOL;
			break;
 
		case '$':
			*patptr++ = EOL;
			break;
 
		case '.':
			*patptr++ = ANY;
			break;
 
		case '[':
			patptr = cclass( patptr );
			break;
 
		case ':':
			if ( (c=getcharacter()) != -1 )
			{
				switch( tolower( c ) )
				{
 
				case 'a':
					*patptr++ = ALPHA;
					break;
 
				case 'd':
					*patptr++ = DIGIT;
					break;
 
				case 'n':
					*patptr++ = NALPHA;
					break;
 
				case ' ':
					*patptr++ = PUNCT;
					break;
 
				default:
					error( "unknown ':' type", RE_ERROR );
 
				}
			}
			else
				error( "no ':' type", RE_ERROR );
 			break;

		case '\\':
			c = getcharacter();
 
		default:
			*patptr++ = CHAR;
			*patptr++ = c;
		}
	}
	*patptr++ = ENDPAT;
	*patptr++ = 0;			/* Terminate string     */

#ifdef DEBUG
	if ( Debug>1 )
	{
		for ( lp=patbuf; lp<patptr; ++lp )
		{
			switch ( c = *lp )
			{
			case CHAR:	printf("char "); break;
			case BOL:	printf("bol "); break;
			case EOL:	printf("eol "); break;
			case ANY:	printf("any "); break;
			case CLASS:	printf("class(%d) ", *++lp); break;
			case NCLASS:	printf("notclass(%d) ",*++lp); break;
			case STAR:	printf("star "); break;
			case PLUS:	printf("plus "); break;
			case MINUS:	printf("minus "); break;
			case ALPHA:	printf("alpha "); break;
			case DIGIT:	printf("digit "); break;
			case NALPHA:	printf("notalpha "); break;
			case PUNCT:	printf("punct "); break;
			case RANGE:	printf("range "); break;
			case ENDPAT:	printf("endpat "); break;
			default:	printf("<%c> ", c); break;
			}
		}
		printf( "\n" );
	}
#endif

	return patptr - patbuf;
}

char *
cclass( patbuf )
char	*patbuf;	/* destination pattern buffer */
{
	/*
	 * Compile a class (within [])
	 */
	char	*patptr,	/* destination pattern pointer */
		*cp;		/* Pattern start     */
	int	c,		/* Current character */
		o;		/* Temp              */

	patptr = patbuf;

	if ( (c = getcharacter()) == -1 )
		error( "class terminates badly", RE_ERROR );
	else if ( c == '^')
	{
		/*
		 * Class exclusion, for example: [^abc]
		 * Swallow the "^" and set token type to class exclusion.
		 */
		o = NCLASS;
	}
	else
	{
		/*
		 * Normal class, for example: [abc]
		 * push back the character and set token type to class
		 */
		ungetcharacter( c );
		o = CLASS;
	}
	*patptr++ = o;

	cp = patptr;	/* remember where byte count is */
	*patptr++ = 0;	/* and initialize byte count */
	while ( (c = getcharacter()) != -1 && c!=']' )
	{
		o = getcharacter();		/* peek at next char */
		if (c == '\\')			/* Store quoted chars */
		{
			if ( o == -1) /* Gotta get something */
				error( "class terminates badly", RE_ERROR );
			*patptr++ = o;
		}
		else if ( c=='-' && (patptr-cp)>1 && o!=']' && o != -1 )
		{
			c = patptr[-1];		/* Range start     */
			patptr[-1] = RANGE;	/* Range signal    */
			*patptr++ = c;		/* Re-store start  */
			*patptr++ = o;		/* Store end char  */
		}
		else
		{
			*patptr++ = c;		/* Store normal char */
			ungetcharacter( o );
		}
	}
	if (c != ']')
		error( "unterminated class", RE_ERROR );
	if ( (c = (patptr - cp)) >= 256 )
		error( "class too large", RE_ERROR );
	if ( c == 0 )
		error( "empty class", RE_ERROR );
	*cp = c;		/* fill in byte count */

	return patptr;
}
 
match( line, pattern )
char	*line;		/* line to match */
char	*pattern;	/* pattern to match */
{
	/*
	 * Match the current line (in Linebuf[]), return 1 if it does.
	 */
	char	*l;		/* Line pointer       */
	char	*pmatch();
	char	*next;
	int	matches;
 
	matches = 0;
	for (l = line; *l; l++)
	{
		if ( next = pmatch(line, l, pattern) )
		{
			l = next - 1;
			++matches;
#ifdef DEBUG
			if ( Debug )
				printf( "match!\n" );
#endif
		}
	}

	return matches;
}
 
char *
pmatch(linestart, line, pattern)
char	*linestart;	/* start of line to match */
char	*line;		/* (partial) line to match      */
char	*pattern;	/* (partial) pattern to match   */
{
	char	*l;	/* Current line pointer         */
	char	*p;	/* Current pattern pointer      */
	char	c;	/* Current character            */
	char	*e;	/* End for STAR and PLUS match  */
	int	op;	/* Pattern operation            */
	int	n;	/* Class counter                */
	char	*are;	/* Start of STAR match          */
 
	l = line;

#ifdef DEBUG
	if (Debug > 1)
		printf("pmatch(\"%s\")\n", line);
#endif

	p = pattern;
	while ((op = *p++) != ENDPAT) {

#ifdef DEBUG
		if (Debug > 1)
			printf("byte[%d] = 0%o, '%c', op = 0%o\n",
					l-line, *l, *l, op);
#endif

		switch(op) {
 
		case CHAR:
			if ( *l++ != *p++)
				return 0;
			break;
 
		case BOL:
			if (l != linestart)
				return 0;
			break;
 
		case EOL:
			if (*l != '\0')
				return 0;
			break;
 
		case ANY:
			if (*l++ == '\0')
				return 0;
			break;
 
		case DIGIT:
			if ((c = *l++) < '0' || (c > '9'))
				return 0;
			break;
 
		case ALPHA:
			c = tolower( *l++ );
			if (c < 'a' || c > 'z')
				return 0;
			break;
 
		case NALPHA:
			c = tolower(*l++);
			if (c >= 'a' && c <= 'z')
				break;
			else if (c < '0' || c > '9')
				return 0;
			break;
 
		case PUNCT:
			c = *l++;
			if (c == 0 || c > ' ')
				return 0;
			break;
 
		case CLASS:
		case NCLASS:
			c = *l++;
			n = *p++ & 0377;
			do {
				if (*p == RANGE) {
					p += 3;
					n -= 2;
					if (c >= p[-2] && c <= p[-1])
						break;
				}
				else if (c == *p++)
					break;
			} while (--n > 1);
			if ((op == CLASS) == (n <= 1))
				return 0;
			if (op == CLASS)
				p += n - 2;
			break;
 
		case MINUS:
			e = pmatch(linestart,l,p);/* Look for a match    */
			while (*p++ != ENDPAT);	/* Skip over pattern   */
			if (e)			/* Got a match?        */
				l = e;		/* Yes, update string  */
			break;			/* Always succeeds     */
 
		case PLUS:			/* One or more ...     */
			if ((l = pmatch(linestart,l,p)) == 0)
				return 0;	/* Gotta have a match  */
		case STAR:			/* Zero or more ...    */
			are = l;		/* Remember line start */
			while (*l && (e = pmatch(linestart,l,p)))
				l = e;		/* Get longest match   */
			while (*p++ != ENDPAT);	/* Skip over pattern   */
			while (l >= are) {	/* Try to match rest   */
				if (e = pmatch(linestart,l,p))
					return e;
				--l;		/* Nope, try earlier   */
			}
			return 0;		/* Nothing else worked */
 
		default:
			fprintf( stderr, "bad op code %d\n", op );
			error( "can't happen -- match", RE_ERROR );
		}
	}
	return l;
}

