/* SGML_stream.c
 * $Id: SGMLstream.c,v 1.3 93/01/06 18:40:28 connolly Exp Locker: connolly $
 */

/* implements... */
#include "SGML.h"

/* uses ... */
#include "object.h"
#include <ctype.h>
#include <assert.h>
#include <string.h>


/*
 * SGML_parseInstance -- parse a document instance from a character stream
 * and report what is found through the callbacks in docclass.
 *
 *   stream   - input handle, passed unchanged to getch
 *   getch    - called as (getch)(stream) to obtain the next input character
 *   document - passed unchanged as the first argument of every callback
 *   docclass - supplies the startTag, endTag, data and entityText callbacks
 *
 * The function calls SGML_read() until it returns EOF and dispatches on
 * the result:
 *   SGML_start_tag  - read the element name and its attributes, then call
 *                     startTag(document, gi, attributes, attrqty); its
 *                     result selects the content mode for the next reads
 *                     (SGML_EMPTY leaves the mode unchanged)
 *   SGML_end_tag    - read the element name and call endTag(document, gi)
 *   SGML_entity     - look the name up with entityText() and, when that
 *                     returns non-null text, pass it to data()
 *   SGML_record_end - see below
 *   anything else   - a count of data characters in buffer, passed to data()
 *
 * Record ends ('\n'): the first one after a start tag (other than an
 * SGML_EMPTY one) and the first one of the document are dropped.
 * Otherwise a record end is held back (RE_pending) until the next event
 * shows what to do with it: it is delivered before data, an entity or a
 * start tag, and discarded before an end tag.
 *
 * Attributes: only attributes of the form NAME=value are reported.  Each
 * reported binding is one NEW'd block holding "NAME\0value\0"; the block
 * is released with FREE as soon as startTag() returns.  At most
 * SGML_ATTCNT bindings are kept; further attributes are still consumed
 * from the input but are not reported.
 */
VOID
  SGML_parseInstance(stream, getch, document, docclass)
HMStream stream;
HMGetcProc *getch;
HMDoc* document;
CONST HMDoc_Class *docclass;
{
  static char RE[] = "\n";
  /* REbuffer[0] holds a record end so that a pending RE and the data that
   * follows it can be handed to data() in a single call. */
  char REbuffer[1 + SGML_LITLEN + SGML_NAMELEN + 4];
  char *buffer = REbuffer + 1;
  int content = SGML_MIXED;
  int lookahead = EOF;
  int len, read;
  char gi[SGML_NAMELEN+1];
  HMBinding attributes[SGML_ATTCNT];
  int attrqty;
  char eat_next_RE = 1, RE_pending = 0;

  REbuffer[0] = '\n'; /*@@ should be 13, not 10! */

  while( (read = SGML_read(stream, getch, buffer, sizeof(REbuffer) - 2,
                           content, &lookahead)) != EOF){
    switch(read){
    case SGML_start_tag:
      if (RE_pending){
        (docclass->data)(document, RE, 1);
      }

      len = SGML_read_name(stream, getch, gi, &lookahead);
      gi[len] = 0;

      attrqty = 0;
      while(isalpha(lookahead)){ /* iterate over attributes */
        len = SGML_read_name(stream, getch,
                             buffer, &lookahead);
        buffer[len] = 0;

        /* a name that is not followed by '=' is read and then ignored */
        if(lookahead == '='){
          int offset = len + 1; /* value starts after the name's NUL */
          int vlen;

          lookahead = EOF;
          /* @@ entity references in attribute value */
          vlen = SGML_read_value(stream,
                                 getch,
                                 buffer + offset,
                                 &lookahead);
          /* SGML_read_value returns EOF when the input ends before any
           * value; treat that as an empty value so that the terminator
           * below lands at buffer[offset] and value stays inside the
           * copied block. */
          if(vlen < 0)
            vlen = 0;
          len = offset + vlen;
          buffer[len++] = '\0';

          /* never store more bindings than the array can hold */
          if(attrqty < SGML_ATTCNT){
            HMBinding* attr = &attributes[attrqty];

            attr->name = NEW(char, len);
            /* NOTE(review): assumes NEW yields a null pointer on failure;
             * in that case the attribute is silently dropped. */
            if(attr->name){
              memcpy(attr->name, buffer, len);
              attr->value = attr->name + offset;
              attrqty++;
            }
          }
        }
      }

      /* look for tag close: skip white space, then discard one character
       * (expected to be the '>') */
      while(isspace(lookahead))
        lookahead = (getch)(stream);
      lookahead = EOF;

      {
        int i;
        int c;

        c = (docclass->startTag)(document, gi, attributes, attrqty);

        if(c == SGML_EMPTY){
          eat_next_RE = 0;
        }else{
          content = c;
          eat_next_RE = 1;
        }

        for(i=0; i<attrqty; i++)
          FREE(attributes[i].name);
      }
      RE_pending = 0;
      break;

    case SGML_end_tag:
      /* drop pending RE */

      len = SGML_read_name(stream, getch, gi, &lookahead);
      gi[len] = 0;

      /* look for tag close */
      while(isspace(lookahead))
        lookahead = (getch)(stream);
      lookahead = EOF;

      (docclass->endTag)(document, gi);
      content = SGML_MIXED; /* @@ could be element */
      eat_next_RE = 0;
      RE_pending = 0;
      break;

    case SGML_entity:
      if (RE_pending){
        (docclass->data)(document, RE, 1);
      }
      eat_next_RE = 0;
      RE_pending = 0;

      {
        /* buffer holds the NUL-terminated entity name */
        CONST char* text = (docclass->entityText)(document, buffer);

        if(text)
          (docclass->data)(document, text, strlen(text));
      }
      break;

    case SGML_record_end:
      if(eat_next_RE){
        eat_next_RE = 0;
        RE_pending = 0;
      }
      else if (RE_pending){
        /* a second RE in a row: deliver the earlier one, keep this one
         * pending */
        (docclass->data)(document, RE, 1);
      }
      else
        RE_pending = 1;

      break;

    default:
      /* read is a count of data characters stored in buffer */
      buffer[read] = 0;
      if(RE_pending)
        (docclass->data)(document, REbuffer, read + 1);
      else
        (docclass->data)(document, buffer, read);
      RE_pending = 0;
      eat_next_RE = 0;
      break;
    }
  }
}


/*****
 * lexical analysis
 *****/

/*
 * SGML_read -- lexical analyser for document content.
 *
 * Reads characters with (getch)(stream) and either collects a run of data
 * characters into buf or recognises a single markup event.
 *
 *   stream, getch   - input source; (getch)(stream) yields the next character
 *   buf             - receives data characters or an entity name
 *   nbytes          - the state machine stops once this many characters
 *                     have been stored in buf
 *   content         - SGML_MIXED, SGML_ELEMENT, SGML_CDATA or SGML_RCDATA
 *   inout_lookahead - in:  a character read earlier but not yet consumed,
 *                          or EOF when there is none
 *                     out: the character that ended this call, or EOF when
 *                          that character was consumed
 *
 * Returns
 *   EOF                          end of input (*inout_lookahead untouched)
 *   SGML_record_end              a '\n' was read
 *   SGML_start_tag, SGML_end_tag a tag was opened; *inout_lookahead is the
 *                                first letter of the element name
 *   SGML_entity                  buf holds a NUL-terminated entity name
 *   otherwise                    the number of data characters stored in
 *                                buf (possibly 0); buf is not NUL-terminated
 *
 * Processing instructions, comments and markup declarations are skipped.
 * Numeric and named (SPACE, RS, RE) character references are replaced by
 * the character they denote.
 */
int
  SGML_read(stream, getch,
            buf, nbytes,
            content,
            inout_lookahead)
HMStream stream;
HMGetcProc* getch;
char* buf;
int nbytes;
int content;
int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start, data, cdata, rcdata, pcdata,
    and, and_hash, cref, entity,
    lt, lt_slash, tag,
    pi,
    lt_bang, lt_bang_dash,
    comment, comment_dash, ps
  } state = start;
  /* auxiliary state: */
  int end_tag; /* saw '/' after '<' */
  char name[SGML_NAMELEN + 1]; /* function character name */
  int name_chars;

  int ret = 0; /* number of characters read */

/* LOOKAHEAD: is there room in buf for n more characters?
 * REDUCE:    consume c and move to state s
 * SHIFT:     move to state s and look at the same c again
 * DONE:      return, leaving c as the caller's lookahead
 * WRITE:     append one character to buf */
#define LOOKAHEAD(n) (ret + n < nbytes)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* prime the pump */
  if((c  = *inout_lookahead) == EOF)
    c = (getch)(stream);

  /* state machine...*/
  while(ret < nbytes){

    switch(state){

    case start:
      if(c == EOF) return EOF;
      else if(c == '\n') { ret = SGML_record_end; DONE(EOF); }
      else if(c == '<'){
        if(LOOKAHEAD(3)) { REDUCE(lt); }
        else { DONE(c); } /* no room for lookahead */
      }else if(c == '&'){
        if(LOOKAHEAD(2)) { REDUCE(and); }
        /* no room: falls through to the data state below */
      }else if(content == SGML_ELEMENT && isspace(c)){
        break; /* ignore whitespace in ELEMENT content */
      }else { SHIFT(data); }
      /* fall through */

    case data:
      if(content == SGML_ELEMENT){
        if(isspace(c)){
          break;
        }else{
          /* data is not expected in ELEMENT content: report nothing */
          *buf = 0; ret = 0;
          if(c == '<' || c == '&') {
            DONE(c); /* markup: let the next call handle it */
          }else{
            /* Drop the offending character.  Handing it back as the
             * lookahead would make the next call stop on it again with
             * a count of 0, so the caller would never make progress. */
            DONE(EOF);
          }
        }
      }else if(content == SGML_CDATA){ SHIFT(cdata); }
      else if(content == SGML_RCDATA){ SHIFT(rcdata); }
      else /* assume SGML_MIXED */ { SHIFT(pcdata); }

    case cdata:
      if(c == EOF || c == '<' || c == '\n') { DONE(c); }
      else{ WRITE(c); break; }

    case rcdata:
    case pcdata:
      if(c == EOF || c == '<' || c == '&' || c == '\n') { DONE(c); }
      else{ WRITE(c); break; }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      else if(isalpha(c)) {
        if(LOOKAHEAD(SGML_NAMELEN+1)){
          name_chars = 0; SHIFT(entity);
        }else{
          DONE(c); /* error: no room for entity name */
        }
      }
      else{ WRITE('&'); SHIFT(data); }

    case entity:
      /* strchr() also matches the string terminator, so a NUL input
       * character has to be excluded explicitly */
      if(isalnum(c)
         || (c != EOF && c != '\0' && strchr(SGML_UCNMCHAR, c))){
        WRITE(c);
        break;
      }
      else{
        WRITE('\0');
        ret = SGML_entity;
        if(c == ';' || c == '\n'){  DONE(EOF); /* eat ; */ }
        else{ DONE(c); /* ended char ref with other char */ }
      }

    case and_hash:
      if(isalnum(c)){ name_chars = 0; SHIFT(cref); }
      else{ WRITE('&'); WRITE('#'); SHIFT(data); }

    case cref:
      /* auxiliary state: name_chars */
      if(isalnum(c)
         || (c != EOF && c != '\0' && strchr(SGML_UCNMCHAR, c))){
        if(name_chars < SGML_NAMELEN)
          name[name_chars++] = c;
        /* else markup error: name too long */
        break;
      }
      else{
        int nc = 0;

        name[name_chars] = '\0';
        if(isdigit(name[0])){
          nc = atoi(name);
        }else if(!strcmp(name, "SPACE")){
          nc = 32;
        }else if(!strcmp(name, "RS")){
          nc = 10;
        }else if(!strcmp(name, "RE")){
          nc = 13;
        }

        if(nc) WRITE(nc); /* else error: bad character reference */

        if(c == ';') { REDUCE(data); }
        else
          /* terminate entity reference w/space or something */
          { SHIFT(data); }
      }

    case lt:
      if(c == '/') { REDUCE(lt_slash); }
      if(content == SGML_MIXED || content == SGML_ELEMENT){
        if(c == '?') { REDUCE(pi); }
        else if(c == '!') { REDUCE(lt_bang); }
        else if(isalpha(c)) { end_tag = 0; SHIFT(tag); }
      }
      /* not markup after all: the '<' is data */
      WRITE('<'); SHIFT(data);

    case lt_slash:
      if(isalpha(c)) { end_tag = 1; SHIFT(tag); }
      else { WRITE('<'); WRITE('/'); SHIFT(data); }

    case tag:
      /* auxiliary state: end_tag */
      ret = end_tag ?  SGML_end_tag : SGML_start_tag;
      DONE(c);

    case pi: /* processing instruction (or markup declaration) */
      if(c == '>') { REDUCE(start); }
      else if(c == EOF) { SHIFT(start); } /* error: EOF in pic */
      else break;

    case lt_bang:
      if(c == '-') { REDUCE(lt_bang_dash); }
      /*
       * *** NON CONFORMING IMPLEMENTATION ***
       * a letter here starts a markup declaration, which isn't supported
       * a [ starts a marked section, which isn't supported.
       * treat them like processing instructions.
       */
      else if(c == '[' || isalpha(c)) { REDUCE(pi); }
      else if(c == '>') { REDUCE(start); }
      else{ WRITE('<'); WRITE('!'); SHIFT(data); }

    case lt_bang_dash:
      if(c == '-') { REDUCE(comment); }
      else{ WRITE('<'); WRITE('!'); WRITE('-'); SHIFT(data); }

    case comment:
      if(c == '-') { REDUCE(comment_dash); }
      else if(c == EOF) { DONE(c); } /* error: eof in comment */
      else break;

    case comment_dash:
      if(c == '-') { REDUCE(ps); }
      else if(c == EOF) { DONE(c); }/* error: eof in comment */
      else { REDUCE(comment); }

    case ps: /* parameter separator between -- and > */
      if(c == EOF) { DONE(c); }
      else if(isspace(c)) break;
      else { REDUCE(start); }/* error if c !='>' */

    }
    c = (getch)(stream);
  }

  DONE(c); /* set up lookahead for next call */
#undef S
#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}


/*
 * SGML_read_name -- read a name (element or attribute name) and the
 * white space that follows it.
 *
 *   stream, getch   - input source; (getch)(stream) yields the next character
 *   buf             - receives the name, folded to upper case; it is NOT
 *                     NUL-terminated here.  Callers terminate it with
 *                     buf[result] = 0, so buf needs SGML_NAMELEN+1 bytes.
 *   inout_lookahead - in:  the first character of the name
 *                     out: the first character after the name and any
 *                          trailing white space
 *
 * Returns the number of characters stored in buf, at most SGML_NAMELEN;
 * the excess characters of a longer name are read and discarded.
 * Returns 0, without reading anything and without changing
 * *inout_lookahead, when the lookahead is not a letter.
 */
int
  SGML_read_name(stream, getch, buf, inout_lookahead)
HMStream stream;
HMGetcProc* getch;
char* buf;
int* inout_lookahead;
{
  int name_chars = 0;
  int c = *inout_lookahead;

  if(!isalpha(c)) return 0;

  do{
    /* strictly less than: one byte stays free for the caller's NUL */
    if(name_chars < SGML_NAMELEN)
      buf[name_chars++] = toupper(c);
    /* else error: name too long */
    c = (getch)(stream);
    /* strchr() also matches the string terminator, so a NUL input
     * character has to be excluded explicitly */
  }while(isalnum(c)
         || (c != EOF && c != '\0' && strchr(SGML_UCNMCHAR, c)));

  while(isspace(c))
    c = (getch)(stream);

  *inout_lookahead = c;
  return name_chars;
}


/*
 * SGML_read_value -- read one attribute value.
 *
 *   stream, getch   - input source; (getch)(stream) yields the next character
 *   buf             - receives the value; it is NOT NUL-terminated here
 *   inout_lookahead - in:  a character read earlier but not yet consumed,
 *                          or EOF when there is none
 *                     out: the first character after the value and any
 *                          trailing white space; EOF when the value was
 *                          cut off for being too long
 *
 * Leading white space is skipped.  A value in single or double quotes is
 * read up to the matching quote; newlines and tabs inside it become
 * spaces, and numeric or named (SPACE, RS, RE) character references are
 * replaced by the character they denote.  Unquoted values are accepted
 * only when GROK_UNQUOTED_LITERALS or SGML_SHORTTAG is defined.
 *
 * Returns the number of characters stored in buf, or EOF when the input
 * ends before a value starts (*inout_lookahead is then left untouched).
 * The state machine stops once SGML_LITLEN characters have been stored.
 */
int
  SGML_read_value (stream,
                   getch,
                   buf,
                   inout_lookahead)
HMStream stream;
HMGetcProc* getch;
char* buf;
int* inout_lookahead;
{

  int c; /* state machine input character */
  enum { /* state machine states */
    start,
    literal,
    and, and_hash, cref,
#if defined(SGML_SHORTTAG) || defined(GROK_UNQUOTED_LITERALS)
    value,
#endif
    ps
  } state = start;
  /* auxiliary state: */
  char quote; /* which kind of quote */

  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars;

/* REDUCE: consume c and move to state s
 * SHIFT:  move to state s and look at the same c again
 * DONE:   return, leaving c as the caller's lookahead
 * WRITE:  append one character to buf */
#define LOOKAHEAD(n) (ret + n < SGML_LITLEN)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* prime the pump */
  if((c  = *inout_lookahead) == EOF)
    c = (getch)(stream);
  
  /* state machine...*/
  while(ret < SGML_LITLEN){

    switch(state){

    case start:
      if(c == EOF) return EOF;
      else if(c == '"') { quote = c; REDUCE(literal); }
      else if(c == '\'') { quote = c; REDUCE(literal); }
      else if(isspace(c)) break;
#ifdef GROK_UNQUOTED_LITERALS
      else if(!(c == '>')){
        SHIFT(value);
      }
#else
#ifdef SGML_SHORTTAG
      /* strchr() also matches the string terminator, so a NUL input
       * character has to be excluded explicitly */
      else if(isalnum(c)
              || (c != EOF && c != '\0' && strchr(SGML_UCNMCHAR, c))){
        SHIFT(value);
      }
#else
      else { DONE(c); } /* error: illegal char in markup */
#endif
#endif

#ifdef GROK_UNQUOTED_LITERALS
    case value:
      if(c == EOF) { DONE(c); }
      else if(isspace(c) || c == '>'){ SHIFT(ps); }
      else{
        WRITE(c);
        break;
      }
#else
#ifdef SGML_SHORTTAG
    case value:
      if(c == EOF) { DONE(c); }
      else if(isalnum(c)
              || (c != '\0' && strchr(SGML_UCNMCHAR, c))){
        WRITE(c);
        break;
      }else{ SHIFT(ps); }
#endif
#endif

    case literal:
      if(c == EOF) { DONE(c); }
      else if(c == quote) { REDUCE(ps); }
      else if(c == '&'){ REDUCE(and); }
      else if(c == '\n' || c == '\t'){ WRITE(' '); break; }
      else{
        WRITE(c);
        break;
      }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      /*@@ else if(isalpha(c)) ... process entity reference */
      else{ WRITE('&'); SHIFT(literal); }

    case and_hash:
      if(isalnum(c)){ name_chars = 0; SHIFT(cref); }
      else{ WRITE('&'); WRITE('#'); SHIFT(literal); }

    case cref:
      /*@@ in case of &#13xyz, this throws out xyz as error, when
        it should only throw out x */
      if(isdigit(c) || isalpha(c)
         || (c != EOF && c != '\0' && strchr(SGML_UCNMCHAR, c))){
        if(name_chars < SGML_NAMELEN)
          name[name_chars++] = c;
        /* else markup error: name too long */
        break;
      }
      else{
        int nc = 0;

        name[name_chars] = '\0';
        if(isdigit(name[0])){
          nc = atoi(name);
        }else if(!strcmp(name, "SPACE")){
          nc = 32;
        }else if(!strcmp(name, "RS")){
          nc = 10;
        }else if(!strcmp(name, "RE")){
          nc = 13;
        }
        /* An unknown name leaves nc at 0: the reference is dropped and
         * the literal continues.  The machine must not stay in this
         * state, or it would swallow the rest of the input and never
         * stop at EOF. */

        if(nc) WRITE(nc); /* else error: bad character reference */

        if(c == ';') { REDUCE(literal); }
        else
          /* terminate entity reference w/space or something */
          { SHIFT(literal); }
      }

    case ps: /* parameter separator between attributes */
      if(isspace(c)) break;
      else { DONE(c); }

    }
    c = (getch)(stream);
  }

  /* error: attribute value too long */

  DONE(EOF); /* set lookahead to EOF for next call */
#undef S
#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}
