/* National Institute of Standards and Technology (NIST)
/* National Computer System Laboratory (NCSL)
/* Office Systems Engineering (OSE) Group
/* ********************************************************************
/*                            D I S C L A I M E R
/*                              (March 8, 1989)
/*  
/* There is no warranty for the NIST NCSL OSE SGML parser and/or the NIST
/* NCSL OSE SGML parser validation suite.  If the SGML parser and/or
/* validation suite is modified by someone else and passed on, NIST wants
/* the parser's recipients to know that what they have is not what NIST
/* distributed, so that any problems introduced by others will not
/* reflect on our reputation.
/* 
/* Policies
/* 
/* 1. Anyone may copy and distribute verbatim copies of the SGML source
/* code as received in any medium.
/* 
/* 2. Anyone may modify your copy or copies of SGML parser source code or
/* any portion of it, and copy and distribute such modifications provided
/* that all modifications are clearly associated with the entity that
/* performs the modifications.
/* 
/* NO WARRANTY
/* ===========
/* 
/* NIST PROVIDES ABSOLUTELY NO WARRANTY.  THE SGML PARSER AND VALIDATION
/* SUITE ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
/* EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
/* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
/* THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS
/* WITH YOU.  SHOULD THE SGML PARSER OR VALIDATION SUITE PROVE DEFECTIVE,
/* YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
/* 
/* IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL NIST BE LIABLE FOR
/* DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL,
/* INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR
/* INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA
/* BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A
/* FAILURE OF THE PROGRAM TO OPERATE WITH PROGRAMS NOT DISTRIBUTED BY
/* NIST) THE PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF
/* SUCH DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
*/

/************************************************************************/
/*   TITLE:          SGML PARSER                                        */
/*   SYSTEM:         DOCUMENT PROCESSOR                                 */
/*   SUBSYSTEM:                                                         */
/*   SOURCE FILE:    DIGETS.C                                           */
/*   AUTHOR:         Steven Lindeman, Fred Maples                       */
/*                                                                      */
/*   DATE CREATED:                                                      */
/*   LAST MODIFIED:                                                     */
/*                                                                      */
/*                  REVISIONS                                           */
/*   WHEN      WHO            WHY                                       */
/************************************************************************/
#include <stdio.h>
#include <search.h>
#include <ctype.h>
#include "didefs.h"
#include "diglobal.h"

/*------------------------------------------------------*/
/*                   G E T D E L I M                    */
/*                                                      */
/*  Called by:  GETTOKEN                                */
/*                                                      */
/*  Returns:    ETAGO, STAGO, PIO, MDO, NODELIM or EOF  */
/*                                                      */
/*  Reads a delimiter from 'indoc'.  Delimiters are     */
/*  recognized in context only:                         */
/*     '</' + letter                       -> ETAGO     */
/*     '<'  + letter                       -> STAGO     */
/*     '<?'                                -> PIO       */
/*     '<!' + '--', letter, '[' or                      */
/*            MARKUP_END                   -> MDO       */
/*  For a recognized delimiter its output form is put   */
/*  in the output buffer and the look-ahead characters  */
/*  are pushed back.  Otherwise every character read    */
/*  is pushed back so it can be handled as data, and    */
/*  NODELIM is returned.                                */
/*                                                      */
/*  our_ungetc() is used as a last-in first-out         */
/*  pushback (see the paired calls below).              */
/*  unget_string() is assumed to push a string back so  */
/*  that it is re-read left to right - TODO confirm.    */
/*                                                      */
/*  NOTE(review): POSIX.1-2008 declares an unrelated    */
/*  getdelim() in <stdio.h>; the names may clash when   */
/*  POSIX feature macros are enabled.                   */
/*------------------------------------------------------*/
getdelim()
{
   register int inchar,retval;

   switch(inchar=our_fgetc(indoc)) {
   case '<':
      switch(inchar=our_fgetc(indoc)) {
      case '/':        /* found immediately after TAGO, */
         if (isalpha(inchar=our_fgetc(indoc))) {
            our_ungetc(inchar,indoc);
            putstr_outbuf("\n[/");  /* so it's an endtag */
            retval = ETAGO;
         }
         else {
            our_ungetc(inchar,indoc);  /* unget in reverse order */
            unget_string("</");
            retval = NODELIM;
         }
         break;
      case '?':        /* found a processing instruction */
         putstr_outbuf("\n[?");
         retval = PIO;
         break;
      case '!':  /* possibly a markup declaration open */
         if ((inchar=our_fgetc(indoc)) == '-') {
            if ((inchar=our_fgetc(indoc)) == '-') {
               unget_string("--");  /* comment start is left for the caller */
               putstr_outbuf("<!");
               retval = MDO;
            }
            else {
               /* "<!-" followed by something else is data: restore */
               /* all four characters in their original order       */
               our_ungetc(inchar,indoc);
               unget_string("<!-");
               retval = NODELIM;
            }
         }
         else {
            if (isalpha(inchar) || inchar=='[' || inchar==MARKUP_END) {
               our_ungetc(inchar,indoc);
               putstr_outbuf("<!");
               retval = MDO;
            }
            else {
               /* "<!" not in context is data: restore it as well */
               our_ungetc(inchar,indoc);
               unget_string("<!");
               retval = NODELIM;
            }
         }
         break;
      default:
         if (isalpha(inchar)) {
            our_ungetc(inchar,indoc);
            putstr_outbuf("\n[");
            retval = STAGO;
         }
         else {
            our_ungetc(inchar,indoc);  /* unget in reverse order */
            our_ungetc('<',indoc);
            retval = NODELIM;
         }
         break;
      }
      break;
   case EOF:
      retval = EOF;  /* no more data */
      break;
   default:
      our_ungetc(inchar,indoc);
      retval = NODELIM;  /* no delimiter was found, probably data */
      break;
   }
   return(retval);
}

/*------------------------------------------------------*/
/*                   G E T C D A T A                    */
/*                                                      */
/*  Passes character data (CDATA) from 'indoc' to the   */
/*  application routine and to the control output, one  */
/*  piece at a time.  Scanning stops at EOF or at an    */
/*  ETAGO in context ('</' followed by a letter); the   */
/*  end tag characters are pushed back for the caller.  */
/*  A '<' or '</' that is not in context is passed on   */
/*  as data.  An entity end (OUR_EE) pops the entity    */
/*  stack and is fatal when the popped entry does not   */
/*  match lookstack().                                  */
/*                                                      */
/*  save_crs() and check_cr() are defined elsewhere;    */
/*  they appear to do the record-end bookkeeping for    */
/*  the data being passed on.                           */
/*                                                      */
/*  Returns:  FOUND when any data was passed on,        */
/*            otherwise NFDHT.  When FOUND, a '|' is    */
/*            written to the control output at the end. */
/*------------------------------------------------------*/
STATUS getcdata()
{
   int ch;
   unsigned pending_crs;
   char *one_char;
   STATUS status;
   register BOOLEAN reading,saw_cr;
   BOOLEAN first_data;

   status = NFDHT;
   one_char = get_char_mem(2);
   flush_buf();
   reading = TRUE;
   first_data = TRUE;
   pending_crs = 0;

   while (reading==TRUE && (ch=our_fgetc(indoc))!=EOF) {
      saw_cr = save_crs(&pending_crs,&ch);

      if (ch == '<') {
         ch = our_fgetc(indoc);
         if (ch != '/') {
            /* a lone '<' is ordinary data */
            status = FOUND;
            check_cr(&pending_crs,saw_cr,&first_data,FALSE);
            (*put_ctr)('<',ctrfp);
            (*applic)(DATA_STG,"<","");
            our_ungetc(ch,indoc);
            continue;
         }

         ch = our_fgetc(indoc);
         if (isalpha(ch)) {
            /* end tag in context: give it back and stop */
            reading = FALSE;
            our_ungetc(ch,indoc);  /* unget in reverse order */
            unget_string("</");
         }
         else {
            /* '</' without a name start character is data */
            status = FOUND;
            check_cr(&pending_crs,saw_cr,&first_data,FALSE);
            (*print_ctr)(ctrfp,"</");
            (*applic)(DATA_STG,"</","");
            our_ungetc(ch,indoc);
         }
         continue;
      }

      if (ch == OUR_EE) {
         /* the entity must end in the element it started in */
         if (entstack[--entitylevel] != lookstack())
            ourexit(2,"\nError: Entity End occurred in different character data.\n");
         continue;
      }

      /* any other character is data */
      status = FOUND;
      check_cr(&pending_crs,saw_cr,&first_data,FALSE);
      *one_char = ch;
      (*applic)(DATA_STG,one_char,"");
      (*put_ctr)(ch,ctrfp);
   }

   if (status == FOUND)
      (*put_ctr)('|',ctrfp);
   free(one_char);
   return(status);
}

/*------------------------------------------------------*/
/*                  G E T P C D A T A             */
/* This routine reads parsable character data   */
/* from 'indoc'.  PCDATA is terminated by an */
/* etago, delimiter in context, given that all  */
/* entities have been closed.  That is, a '</'  */
/* followed by a name start character.  General */
/* entities, as well as numeric and named char- */
/* acter references are resolved.         */
/*                      */
/*              returns -- NFDHT, FOUND            */
/*------------------------------------------------------*/
STATUS getpcdata(genthead,penthead)
ENTITYDESC *genthead,*penthead;
{
   int inchar,token,token2;
   unsigned num_cr;
   char *outstr;
   BOOLEAN more_pcdata,more_subdata,cr_found,pcdata_ft;
   STENTRY *tp;
   TKNRETVAL tknretval;
   STATUS retval;
   TNODE *newcm;

   flush_buf();
   outstr = get_char_mem(2);
   num_cr = 0;
   retval = NFDHT;
   tknretval = TEXT;  /* so will read all the data first */
   our_ungetc(inchar=our_fgetc(indoc),indoc);  /* initialize inchar */
   token = -1;
   tp = NULL; /* just to satisfy lint */
   more_pcdata = pcdata_ft = TRUE;

   while(more_pcdata && inchar!=EOF) {
      if (open_rcdata_ms)
         retval = getrcdata(genthead,FALSE,&pcdata_ft,FALSE);
      else
         if (open_cdata_ms)
            retval = get_cdata_ms(&pcdata_ft);
         else
            if (tknretval == TEXT) {
               more_subdata = TRUE;
               while(more_subdata && (inchar=our_fgetc(indoc))!=EOF) {
                  cr_found = save_crs(&num_cr,&inchar);
                  if (inchar == EOF)
                     more_subdata = pcdata_ft = FALSE;
                  else
                     if (inchar == ']')
                        if ((inchar=our_fgetc(indoc)) == ']')
                           if ((inchar=our_fgetc(indoc)) == MARKUP_END) {
                              if (--num_open_ms < 0)
                                 ourexit(2,"\nError: Marked section end outside of declaration.\n");
                           }
                           else {
                              retval = FOUND;
                              check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
                              our_ungetc(inchar,indoc);
                              (*applic)(DATA_STG,"]]","");
                              (*print_ctr)(ctrfp,"]]");
                           }
                        else {
                           retval = FOUND;
                           check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
                           our_ungetc(inchar,indoc);
                           (*put_ctr)(']',ctrfp);
                           (*applic)(DATA_STG,"]","");
                        }
                     else
                        if (inchar == '<')
                           if ((inchar=our_fgetc(indoc)) == '/')
                              if (isalpha(inchar=our_fgetc(indoc))) {
                                 more_subdata = pcdata_ft = FALSE;
                                 our_ungetc(inchar,indoc);
                                 unget_string("</");
                              }
                              else {
                                 retval = FOUND;
                                 check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
                                 (*print_ctr)(ctrfp,"</");
                                 (*applic)(DATA_STG,"</","");
                                 our_ungetc(inchar,indoc);
                              }
                           else  /* found markup */
                           if (inchar=='?' || isalpha(inchar)) {
                              more_subdata = pcdata_ft = FALSE;
                              our_ungetc(inchar,indoc);
                              our_ungetc('<',indoc);
                           }
                           else
                              if (inchar == '!')
                                 retval = check_for_mdo(&more_subdata,&num_cr,cr_found,&pcdata_ft);
                              else {
                                 retval = FOUND;
                                 check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
                                 (*put_ctr)('<',ctrfp);
                                 (*applic)(DATA_STG,"<","");
                                 our_ungetc(inchar,indoc);
                              }
                        else {
                           retval = FOUND;
                           check_cr(&num_cr,cr_found,&pcdata_ft,FALSE);
                           try_entref(inchar,genthead,FALSE,&pcdata_ft);
                        }
               }
               check_cr(&num_cr,cr_found,&pcdata_ft,TRUE);
            }
            else
               if (find_except(currincl,token) && !find_except(currexcl,token)) {
                  newcm = pushcreate(tp);
                  if ((retval = traverse(newcm,tp,genthead,penthead,&pcdata_ft)) == NFSH) {
                     if (tknretval == TEXT)
                        sprintf(error_msg,"%s%s%s","\nError: Invalid data, last opened element '",tp->nametoken,"'.\n");
                     else
                        sprintf(error_msg,"%s%s%s","\nError: Invalid tag, last opened element '",tp->nametoken,"'.\n");
                     FATAL_ERROR()
                  }

                  /* check to make sure the element has content */
                  if (EMPTY_CONTENT(newcm)) {   /* can't have endtag for EMPTY */
                     token2 = token | HIGHBIT;
                     putstr_outbuf("\n[/");
                     putstr_outbuf(tp->nametoken);
                     putstr_outbuf("]");
                     place_in_queue(END_TAG_NAME,tp->nametoken,"");
                  }
                  else
                     tknretval = gettoken(&tp,&token2,genthead,penthead,&pcdata_ft);  /* must be end tag */
                  if (IS_STARTTAG(token2) || IS_ENDTAG_NOTEQ(token2,token))
                     resolve_endtag(tp->cmptr,token2,tp,&retval,tknretval,genthead,penthead,token);
                  popfree(newcm);  /* through with this content model */
               }
               else {
                  more_pcdata = FALSE;
                  ungettoken(token,tp);
               }
      if (more_pcdata)
         tknretval = gettoken(&tp,&token,genthead,penthead,&pcdata_ft);
   }
   if (retval==FOUND && !cr_found)
      (*put_ctr)('|',ctrfp);
   free(outstr);
   return(retval);
}

/*------------------------------------------------------*/
/*                  G E T R C D A T A        */
/*      Reads 'indoc' for replaceable character data. */
/* Entity references are resolved normally.  */
/* RCDATA is terminated by an etago, delimiter  */
/* in context.  That is a '</' followed by a name  */
/* start character.           */
/*                   */
/*          returns -- NFDHT, FOUND          */
/*------------------------------------------------------*/
STATUS getrcdata(genthead,look_for_endtag,firsttime,end_of_data)
ENTITYDESC *genthead;
BOOLEAN look_for_endtag,*firsttime,end_of_data;
{
   int inchar,
       prev_entitylevel;
   unsigned num_cr;
   BOOLEAN 
       more_rcdata,
       same_entity,
       cr_found,
       rcdata_ft;
   STATUS retval;

   more_rcdata = same_entity = TRUE;
   flush_buf();
   retval = NFDHT;
   num_cr = 0;
   rcdata_ft = *firsttime;

   while(more_rcdata && (inchar=our_fgetc(indoc))!=EOF) {
      cr_found = save_crs(&num_cr,&inchar);
      if (inchar=='<' && look_for_endtag)
         if ((inchar=our_fgetc(indoc)) == '/')
            if (isalpha(inchar=our_fgetc(indoc)) && same_entity) {
               more_rcdata = FALSE;
               our_ungetc(inchar,indoc);  /* unget in reverse order */
               unget_string("</"); 
            }
            else {
               retval = FOUND;
               check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
               (*print_ctr)(ctrfp,"</");
               (*applic)(DATA_STG,"</","");
               our_ungetc(inchar,indoc);
            }
         else {
            retval = FOUND;
            check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
            (*put_ctr)('<',ctrfp);
            (*applic)(DATA_STG,"<","");
            our_ungetc(inchar,indoc);
         }
      else
         if (inchar==']' && !look_for_endtag)
            if ((inchar=our_fgetc(indoc)) == ']')
               if ((inchar=our_fgetc(indoc))==MARKUP_END && same_entity) {
                  more_rcdata = open_rcdata_ms = FALSE;
                  unget_string("]]>");
               }
               else {
                  retval = FOUND;
                  check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
                  (*print_ctr)(ctrfp,"]]");
                  (*applic)(DATA_STG,"]]","");
                  our_ungetc(inchar,indoc);
               }
            else {
               retval = FOUND;
               check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
               (*print_ctr)(ctrfp,"]");
               (*applic)(DATA_STG,"]","");
               our_ungetc(inchar,indoc);
            }
         else
            if (inchar == OUR_EE) {
               if (entstack[--entitylevel] != lookstack())
                  ourexit(2,"\nError: Entity End occurred in different replaceable character data.\n");
               check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
               same_entity = TRUE;
            }
            else {
               retval = FOUND;
               check_cr(&num_cr,cr_found,&rcdata_ft,FALSE);
               prev_entitylevel = entitylevel;
               try_entref(inchar,genthead,FALSE,&rcdata_ft);
               same_entity = (prev_entitylevel == entitylevel) ? TRUE : FALSE;
            }
   }
   if (retval==FOUND && end_of_data)
      (*put_ctr)('|',ctrfp);
   return(retval); 
}

/*------------------------------------------------------*/
/*         G E T _ M A R K E D _ S E C T I O N          */
/*                                                      */
/*  Processes the start of a marked section; the MDO    */
/*  has already been consumed by the caller.  Expects   */
/*  '[', the status keyword specification, and a        */
/*  second '['.  Then, by effective status:             */
/*     INCLUDE -- nothing more; the content is parsed   */
/*                normally by the caller                */
/*     CDATA   -- sets open_cdata_ms                    */
/*     RCDATA  -- sets open_rcdata_ms                   */
/*     IGNORE  -- skips input up to the matching        */
/*                ']]' + MARKUP_END, counting nested    */
/*                '<![' opens in num_open_ms            */
/*                                                      */
/*  penthead -- passed to get_status_keyword() and      */
/*              inputps()                               */
/*------------------------------------------------------*/
void get_marked_section(penthead)
ENTITYDESC *penthead;
{
   register int inchar,statkey;
   int begnum_open;
   BOOLEAN moredata,close_read=FALSE;

   if ((inchar=our_fgetc(indoc)) != '[')
      ourexit(2,"\nError: DSO not found in marked section.\n");

   statkey = get_status_keyword(penthead);
   while(inputps(penthead) > 0)
      gettilnosep();
   if ((inchar=our_fgetc(indoc)) != '[')
      ourexit(2,"\nError: DSO not found in marked section.\n");

   switch(statkey) {
   case MS_INCLUDE:
      break;
   case MS_CDATA:
      open_cdata_ms = TRUE;
      break;
   case MS_RCDATA:
      open_rcdata_ms = TRUE;
      break;
   case MS_IGNORE:
      begnum_open = num_open_ms-1;   /* already incremented */
      moredata = TRUE;
      while(moredata && (inchar=our_fgetc(indoc))!=EOF) {
         if (inchar == '<') {
            if ((inchar=our_fgetc(indoc))=='!' && (inchar=our_fgetc(indoc))=='[') {
               /* nested marked section start */
               if (++num_open_ms > TAGLVL)
                  ourexit(2,"\nError: Number open marked sections > TAGLVL.\n");
            }
            else
               /* not '<![': re-scan the character that broke the */
               /* match, it may itself begin a delimiter           */
               if (inchar != EOF)
                  our_ungetc(inchar,indoc);
         }
         else
            if (inchar == ']') {
               if ((inchar=our_fgetc(indoc)) != ']') {
                  if (inchar != EOF)
                     our_ungetc(inchar,indoc);
               }
               else
                  if ((inchar=our_fgetc(indoc)) == MARKUP_END) {
                     /* marked section close */
                     if (--num_open_ms == begnum_open)
                        moredata = FALSE;
                  }
                  else {
                     /* "]]x": restart at the second ']' so that */
                     /* a close such as "]]]>" is still found    */
                     if (inchar != EOF)
                        our_ungetc(inchar,indoc);
                     our_ungetc(']',indoc);
                  }
            }
            else
               if (inchar == OUR_EE)
                  ourexit(2,"\nError: Entity End found in IGNORE marked section.\n");
      }
      STRIP_CRs();
      close_read = TRUE;
      break;
   default:
      software_fault();
      break;
   }
   /* Only reached with an unknown status keyword, i.e. when */
   /* software_fault() returns: expect the section close.    */
   if (!close_read && statkey!=MS_INCLUDE && statkey!=MS_CDATA && statkey!=MS_RCDATA) {
      if ((inchar=our_fgetc(indoc))!=']' || (inchar=our_fgetc(indoc))!=']')
         ourexit(2,"\nError: MSC not found in marked section.\n");
      if ((inchar=our_fgetc(indoc)) != MARKUP_END)
         ourexit(2,"\nError: MDC not found in marked section.\n");
   }
   return;
}

/*------------------------------------------------------*/
/*                  G E T _ N A M E                     */
/*                                                      */
/*  Reads an SGML name from 'indoc' into 'name', which  */
/*  is first cleared over NAMELEN+1 bytes.  The first   */
/*  character must be a letter; it is stored through    */
/*  'capitalize' and echoed to the output buffer.  The  */
/*  rest is collected by fillup() (defined elsewhere).  */
/*                                                      */
/*  Fatal errors: first character is not a letter;      */
/*                resulting length exceeds NAMELEN.     */
/*                                                      */
/*  Returns:  the index left by fillup(), i.e. the      */
/*            number of characters stored.              */
/*------------------------------------------------------*/
get_name(name,capitalize)
char name[];
int (*capitalize)();
{
   int first,len,starts_ok;

   memset(name,'\0',NAMELEN+1);
   first = our_fgetc(indoc);
   starts_ok = isalpha(first);

   /* the first character is stored whether or not it is valid, */
   /* so the error message can show what was actually read      */
   name[0] = (*capitalize)(first);
   len = 1;

   if (starts_ok) {
      putchar_outbuf(name[0]);
      fillup(name,&len,capitalize);
      if (len > NAMELEN) {
         sprintf(error_msg,"%s%s%s","\nError: Length of name beginning '",name,"' > NAMELEN\n");
         FATAL_ERROR()
      }
   }
   else {
      fillup(name,&len,capitalize);
      sprintf(error_msg,"%s%s%s","\nError: Name '",name,"' must start with name start character\n");
      FATAL_ERROR()
   }
   return(len);
}

/*------------------------------------------------------*/
/*               G E T _ E N T N A M E                  */
/*                                                      */
/*  Same job as get_name(), with two differences seen   */
/*  in the code: the first character is not echoed to   */
/*  the output buffer, and the remaining characters     */
/*  are collected by fillup2() instead of fillup().     */
/*                                                      */
/*  Fatal errors: first character is not a letter;      */
/*                resulting length exceeds NAMELEN.     */
/*                                                      */
/*  Returns:  the index left by fillup2().              */
/*------------------------------------------------------*/
get_entname(name,capitalize)
char name[];
int (*capitalize)();
{
   int first,len,starts_ok;

   memset(name,'\0',NAMELEN+1);
   first = our_fgetc(indoc);
   starts_ok = isalpha(first);

   /* stored even when invalid so the message can show it */
   name[0] = (*capitalize)(first);
   len = 1;
   fillup2(name,&len,capitalize);

   if (starts_ok) {
      if (len > NAMELEN) {
         sprintf(error_msg,"%s%s%s","\nError: Length of name beginning '",name,"' > NAMELEN\n");
         FATAL_ERROR()
      }
   }
   else {
      sprintf(error_msg,"%s%s%s","\nError: Name '",name,"' must start with name start character\n");
      FATAL_ERROR()
   }
   return(len);
}

/*------------------------------------------------------*/
/*               G E T _ N U T O K E N                  */
/*                                                      */
/*  Reads an SGML number token from 'indoc' into        */
/*  'nutoken', which is first cleared over NAMELEN+1    */
/*  bytes.  The first character must be a digit; it is  */
/*  stored through 'capitalize' and echoed to the       */
/*  output buffer.  The rest is collected by fillup()   */
/*  (defined elsewhere).                                */
/*                                                      */
/*  Fatal errors: first character is not a digit;       */
/*                resulting length exceeds NAMELEN.     */
/*                                                      */
/*  Returns:  the index left by fillup().               */
/*------------------------------------------------------*/
get_nutoken(nutoken,capitalize)
char nutoken[];
int (*capitalize)();
{
   int first,len,starts_ok;

   memset(nutoken,'\0',NAMELEN+1);
   first = our_fgetc(indoc);
   starts_ok = isdigit(first);      /* nutoken must start with numeral */

   /* stored even when invalid so the message can show it */
   nutoken[0] = (*capitalize)(first);
   len = 1;

   if (starts_ok) {
      putchar_outbuf(nutoken[0]);
      fillup(nutoken,&len,capitalize);
      if (len > NAMELEN) {
         sprintf(error_msg,"%s%s%s","\nError: Length of nutoken beginning '",nutoken,"' > NAMELEN\n");
         FATAL_ERROR()
      }
   }
   else {
      fillup(nutoken,&len,capitalize);
      sprintf(error_msg,"%s%s%s","\nError: Nutoken '",nutoken,"' must start with numeral.\n");
      FATAL_ERROR()
   }
   return(len);
}

/*------------------------------------------------------*/
/*               G E T _ N M T O K E N                  */
/*                                                      */
/*  Reads an SGML name token from 'indoc' into          */
/*  'nmtoken', which is first cleared over NAMELEN+1    */
/*  bytes.  No check is made on the first character     */
/*  here; all characters are collected by fillup()      */
/*  (defined elsewhere).                                */
/*                                                      */
/*  Fatal error:  resulting length exceeds NAMELEN.     */
/*                                                      */
/*  Returns:  the index left by fillup().               */
/*------------------------------------------------------*/
get_nmtoken(nmtoken,capitalize)
char nmtoken[];
int (*capitalize)();
{
   int len;

   len = 0;
   memset(nmtoken,'\0',NAMELEN+1);
   fillup(nmtoken,&len,capitalize);

   if (NAMELEN < len) {
      sprintf(error_msg,"%s%s%s","\nError: Length of nmtoken beginning '",nmtoken,"' > NAMELEN\n");
      FATAL_ERROR()
   }
   return(len);
}

/*------------------------------------------------------*/
/*                G E T _ N U M B E R                   */
/*                                                      */
/*  Reads a run of digits from 'indoc' into 'number',   */
/*  which is first cleared over NAMELEN+1 bytes, and    */
/*  echoes each digit to the output buffer.  The first  */
/*  non-digit is pushed back onto the input.            */
/*                                                      */
/*  At most NAMELEN digits are stored, so 'number' is   */
/*  always NUL terminated and nothing is written past   */
/*  number[NAMELEN].                                    */
/*                                                      */
/*  capitalize -- not used; kept so the signature       */
/*                matches the other get_* routines.     */
/*                                                      */
/*  Fatal errors: more than NAMELEN digits;             */
/*                no digit at all.                      */
/*                                                      */
/*  Returns:  the number of digits stored.              */
/*------------------------------------------------------*/
get_number(number,capitalize)
char number[];
int (*capitalize)(); 
{
   int inchar,indx=0;

   memset(number,'\0',NAMELEN+1);
   /* keep the character in an int: isdigit() and our_ungetc() */
   /* then see exactly what our_fgetc() returned, EOF included  */
   while(isdigit(inchar=our_fgetc(indoc)) && indx<NAMELEN) {
      number[indx] = inchar;
      putchar_outbuf(number[indx++]);
   }
   if (isdigit(inchar)) {   /* loop stopped on the length limit */
      sprintf(error_msg,"%s%s%s","\nError: Length of number beginning '",number,"' > NAMELEN.\n");
      FATAL_ERROR()
   }
   if (indx == 0) {
      number[0] = inchar;   /* show the offending character in the message */
      sprintf(error_msg,"%s%s%s","\nError: Invalid number, found '",number,"'.\n");
      FATAL_ERROR()
   }
   our_ungetc(inchar,indoc);
   return(indx);
}

/*------------------------------------------------------*/
/*                    G E T _ P I                       */
/*                                                      */
/*  Reads a processing instruction from 'indoc'; the    */
/*  PIO has already been consumed by the caller.  No    */
/*  parsing is done on the text.  Characters up to      */
/*  MARKUP_END (or EOF) are copied to the control       */
/*  output and collected in a buffer, which is then     */
/*  handed to the application as PROC_INST.             */
/*                                                      */
/*  Fatal errors: an entity end (OUR_EE) inside the     */
/*                instruction; more than PILEN          */
/*                characters before MARKUP_END.         */
/*------------------------------------------------------*/
void get_pi()
{
   register int inchar,pi_length;
   char outpi[PILEN+1];

   pi_length = 0;
   while((inchar=our_fgetc(indoc))!=MARKUP_END && inchar!=EOF && PILEN>pi_length) {
      if (inchar == OUR_EE)
         ourexit(2,"\nError: EE is invalid in processing instruction.\n");
      else
         (*put_ctr)(inchar,ctrfp);
      outpi[pi_length++] = inchar;
   }
   /* The loop ends on MARKUP_END, on EOF, or because the buffer is */
   /* full.  In the last case inchar is an ordinary character that  */
   /* did not fit, which means the instruction is too long.         */
   if (inchar!=MARKUP_END && inchar!=EOF)
      ourexit(2,"\nError: Length of processing instruction > PILEN.\n");
   (*put_ctr)(']',ctrfp);
   STRIP_CRs();
   outpi[pi_length] = '\0';
   (*applic)(PROC_INST,outpi,"");
   return; 
}

/*------------------------------------------------------*/
/*           G E T _ S T A T U S _ K E Y W O R D        */
/*                                                      */
/*  Parses the status keyword specification of a        */
/*  marked section declaration.  Zero or more keywords  */
/*  may appear before the '[' that ends the             */
/*  specification; that '[' is pushed back for the      */
/*  caller.                                             */
/*                                                      */
/*  With no keyword, MS_INCLUDE is returned.  With      */
/*  several, the largest MS_* value wins (via MAX).     */
/*  The intended priority, highest first, is            */
/*        "IGNORE"                                      */
/*        "CDATA"                                       */
/*        "RCDATA"                                      */
/*        "INCLUDE"                                     */
/*  which relies on the MS_* constants (defined         */
/*  elsewhere) being numbered in that order.            */
/*  "TEMP" is accepted and has no effect; any other     */
/*  keyword is a fatal error.                           */
/*------------------------------------------------------*/
get_status_keyword(penthead)
ENTITYDESC *penthead;
{
   int ch,status;
   char keyword[NAMELEN+1];

   status = MS_INCLUDE;  /* if none are specified, INCLUDE is assumed */

   /* skip whatever inputps() consumes, then separators */
   while (inputps(penthead) > 0)
      ;
   gettilnosep();

   for (;;) {
      ch = our_fgetc(indoc);
      if (ch == '[')
         break;

      our_ungetc(ch,indoc);
      get_entname(keyword,our_toupper);

      if (strcmp(keyword,"IGNORE") == 0)
         status = MAX(MS_IGNORE,status);
      else if (strcmp(keyword,"CDATA") == 0)
         status = MAX(MS_CDATA,status);
      else if (strcmp(keyword,"RCDATA") == 0)
         status = MAX(MS_RCDATA,status);
      else if (strcmp(keyword,"INCLUDE") == 0)
         status = MAX(MS_INCLUDE,status);
      else if (strcmp(keyword,"TEMP") != 0)
         ourexit(2,"\nError: Illegal status keyword in marked section\n");

      while (inputps(penthead) > 0)
         ;
      gettilnosep();
   }

   our_ungetc(ch,indoc);   /* leave the '[' for the caller */
   return(status);
}

/*--------------------------------------------------------------*/
/*                      G E T T O K E N                         */
/*                                                              */
/* Gets the next tag from the document.  If a tag has already   */
/* been read and "ungettoken"d (state != GETNEW), that tag is   */
/* returned.  Otherwise input is read, consuming comments,      */
/* processing instructions and marked section starts, until a   */
/* start tag, an end tag, data or EOF is met.                   */
/*                                                              */
/* tp       -- out: symbol table entry of the tag               */
/* token    -- out: token id of the tag; HIGHBIT is set for an  */
/*             end tag.  Not written when TEXT is returned.     */
/* genthead, penthead -- entity descriptions passed on to the   */
/*             helper routines                                  */
/* get_ft   -- out: set to TRUE after a processing instruction  */
/*             or an end tag; also passed to get_starttag()     */
/*                                                              */
/* Returns:  MARKUP_FOUND for a tag, TEXT for data or EOF.      */
/*--------------------------------------------------------------*/
TKNRETVAL gettoken(tp,token,genthead,penthead,get_ft)
int *token;
STPTR *tp;
ENTITYDESC *genthead,*penthead;
BOOLEAN *get_ft;
{
   char genid[NAMELEN+1];   /* generic identifier read from indoc */
   int curr_delim,  /* current delimiter working with */
      inchar,     /* current input character */
      open_token;
   unsigned
       nleng_spec_list, /* normalized length of specification list */
       num_id_idref;    /* number of ID and IDREF attribute values */
   STENTRY *opened_tp;
   TKNRETVAL retval;     /* either MARKUP_FOUND or TEXT */

   nleng_spec_list = num_id_idref = 0;
   curr_delim = PIO;  /* forces the first pass through the loop */

   if (state == GETNEW) {
      /* keep going while only PIs and markup declarations are found */
      while(curr_delim==PIO || (curr_delim==MDO && !open_cdata_ms && !open_rcdata_ms)) { /* get input from input document */
         flush_buf();
         open_token = ((opened_tp=lookstack()) == NULL) ? rootid : opened_tp->tokenid;
         if (num_open_ms > 0)
            get_ms_closes();
         /* in element content, separators and references between */
         /* tags are consumed here                                 */
         if (symtable[open_token].content_type==ELEMENT_CONTENT && 
                                           !open_cdata_ms && !open_rcdata_ms) {
            while ((inchar=our_fgetc(indoc))=='&' || inchar==RE || inchar==RS ||
                          inchar==SEPCHAR || inchar==SPACE || inchar==OUR_EE) {
               try_entref(inchar,genthead,TRUE,&dontcare);
               gettilnosep();
            }
            our_ungetc(inchar,indoc);
         }
         if (num_open_ms > 0)
            get_ms_closes();

         switch(curr_delim=getdelim()) {
         case NODELIM:  
         case EOF:
            retval = TEXT; /* if find EOF, just assume it was TEXT */
            break;
         case PIO:
            retval = TEXT;  /* just an assumption */
            flush_buf();  /* flush delimiter out */
            *get_ft = TRUE;
            get_pi();
            break;
         case MDO:
            retval = TEXT;  /* just an assumption */
            inchar = our_fgetc(indoc);  /* peek at what follows the MDO */
            our_ungetc(inchar,indoc);
            if (inchar == MARKUP_END) {  /* null comment */
               CLEAR_BUF();  /* clear out MDO */
               if ((inchar=our_fgetc(indoc)) != MARKUP_END)
                  ourexit(2,"\nError: MDC not found for comment declaration\n");
               STRIP_CRs();
            }
            else
               if (inchar == '-') {   /* regular comment */
                  CLEAR_BUF();  /* clear out MDO */
                  while(inputps(penthead) > 0);
                  /* the declaration must now be closed by MDC */
                  if ((inchar=our_fgetc(indoc)) != MARKUP_END)
                     ourexit(2,"\nError: MDC not found for comment declaration\n");
                  STRIP_CRs();
               }
               else {   /* marked section start */
                  CLEAR_BUF();   /* flush delimiter out */
                  if (++num_open_ms > TAGLVL)
                     ourexit(2,"\nError: Number of open marked sections > TAGLVL\n");
                  get_marked_section(penthead);
               }
            break;
         case ETAGO:
            get_name(genid,our_toupper);
            sprintf(lastread_tag,"</%s>",genid);
            place_in_queue(END_TAG_NAME,genid,"");
            retval = MARKUP_FOUND;
            if ((*tp=(STPTR)bsearch(genid,symtable,numsym,sizeof(STENTRY),compare)) != NULL)
               *token = (*tp)->tokenid;
            else {
               sprintf(error_msg,"%s%s%s","\nError: Unknown generic identifier '",genid,"' in endtag.\n");
               FATAL_ERROR()
            }
            *token |= HIGHBIT;  /* turn high bit on for end tags */
            gettilnosep();
            if ((inchar=our_fgetc(indoc)) != MARKUP_END) {
               sprintf(error_msg,"%s%s%s","\nError: TAGC not found for '",genid,"'.\n");
               FATAL_ERROR()
            }
            putchar_outbuf(']');   /* TAGC to buffer */
            STRIP_CRs();
            *get_ft = TRUE;
            break;
         case STAGO:
            retval = get_starttag(token,tp,genthead,get_ft,&nleng_spec_list,&num_id_idref);
            break;
         default:
            software_fault();
         }  /*switch*/
      }  /*while*/
   }
   else {  /* get input from intermediate source, i.e. after ungettoken */
      state = GETNEW;  /* next time get from document */
      *token = holdtoken;
      *tp = holdtp;
      retval = MARKUP_FOUND;
   }
   /* limits on what get_starttag() accumulated */
   if (nleng_spec_list > ATTSPLEN)
      ourexit(2,"\nError: Normalized length of attribute spec list > ATTSPLEN\n");
   if (num_id_idref > GRPCNT)
      ourexit(2,"\nError: Total number of id reference names > GRPCNT.\n");
   return(retval);
}

/*--------------------------------------------------------------*/
/*                    G E T _ S T A R T T A G                   */
/* This routine handles the processing of a start tag.   */
/* First the name of the tag is read and then a search   */
/* is made to ensure that the name is a valid generic */
/* identifier.  The attributes and their values are   */
/* then read in and verified one at a time.     */
/*--------------------------------------------------------------*/
TKNRETVAL get_starttag(token,tp,genthead,get_ft,nleng_spec_list,num_id_idref)
int *token;
STPTR *tp;
ENTITYDESC *genthead;
BOOLEAN *get_ft;
unsigned *nleng_spec_list,*num_id_idref;
{
   char genid[NAMELEN+1],   /* generic identifier read from indoc */
        attrname[NAMELEN+1];  /* name of attribute value */
   int inchar,     /* current input character */
       leng,       /* length of attribute name */
       temp_bufptr,
       tagsize;     /* current length of tag */
   ATTRDESC *thisadp;  /* points to description of attribute */
   BOOLEAN notat_specified;
   TKNRETVAL retval;     /* either MARKUP_FOUND or TEXT */

   notat_specified = FALSE;
   tagsize = get_name(genid,our_toupper);
   sprintf(lastread_tag,"<%s>",genid);
   place_in_queue(TAG_NAME,genid,"");
   retval = MARKUP_FOUND;
   if ((*tp=(STPTR)bsearch(genid,symtable,numsym,sizeof(STENTRY),compare)) != NULL)
      *token = (*tp)->tokenid;
   else {
      sprintf(error_msg,"%s%s%s","\nError: Unknown generic identifier '",genid,"'.\n");
      FATAL_ERROR()
   }
   (*tp)->cmptr->contref_attr = FALSE;
   unprocess((*tp)->adptr);
   tagsize += gettilnosep();
   temp_bufptr = bufptr;
   while((inchar=our_fgetc(indoc)) != MARKUP_END) {
      our_ungetc(inchar,indoc); 
      putchar_outbuf(' ');
      leng = get_name(attrname,our_toupper);
      *nleng_spec_list += leng+NORMSEP;
      tagsize += leng;
      if ((thisadp=find_attr(attrname,(*tp)->adptr)) == NULL) {
         sprintf(error_msg,"%s%s%s","\nError: Unknown attribute name'",attrname,"'.\n");
         FATAL_ERROR()
      }
      else
         if (thisadp->processed == TRUE) {
            sprintf(error_msg,"%s%s%s","\nError: Duplicate attribute specifications '",thisadp->attrname,"'.\n");
            FATAL_ERROR()
         }
         else
            *nleng_spec_list += get_attrvalue(thisadp,genthead,&tagsize,&((*tp)->cmptr->contref_attr),&(notat_specified));
      tagsize += gettilnosep(); 
   }
   bufptr = temp_bufptr;
   if (req_not_proc((*tp)->adptr) == TRUE) {
      sprintf(error_msg,"%s%s%s","\nError: REQUIRED or CURRENT attribute not specified '",
      (*tp)->adptr->attrname,"'.\n");
      FATAL_ERROR()
   }
   if (tagsize > TAGLEN)
      ourexit(3,"\nLength of undelimited start tag > TAGLEN.\n");
   *num_id_idref += resolve_attr((*tp)->adptr,FALSE);
   place_in_queue(TAG_END,"","");
   if ((*tp)->adptr == NULL)
      putchar_outbuf(']');
   else
      putstr_outbuf("\n]");
   STRIP_CRs();
   *get_ft = FALSE;
   return(retval);
}

/*------------------------------------------------------*/
/*             G E T _ A T T R V A L U E                */
/* Processes the value part of one attribute            */
/* specification; the attribute name has already been   */
/* read by the caller.                                  */
/*                                                      */
/* Steps, in input order:                               */
/*  1. skip separators, require '=', skip separators,   */
/*     require an opening LIT or LITA delimiter;        */
/*  2. process_attr() fills 'buffer' from the literal;  */
/*  3. unless the declared value is ENUM_CDATA, the     */
/*     buffer is pushed back onto the input and read    */
/*     again with the reader that matches the declared  */
/*     value type, which checks the syntax;             */
/*  4. the value is compared through check_fixed() and  */
/*     stored in the attribute descriptor.              */
/*                                                      */
/* thisadp         descriptor of the attribute; marked  */
/*                 processed, and u2.currgrp (GROUP,    */
/*                 NOTATION) or u2.currdef (all other   */
/*                 types) is replaced                   */
/* genthead        entity list used for ENTITY values   */
/* taglen          running tag length, increased here   */
/* contref         set TRUE for an A_CONREF attribute   */
/* notat_specified set TRUE for a NOTATION attribute    */
/*                                                      */
/* A content reference attribute and a notation         */
/* attribute in the same tag are rejected, whichever    */
/* comes first.                                         */
/*                                                      */
/* Returns the normalized length of the value; a        */
/* length above LITLEN ends the run through ourexit().  */
/*------------------------------------------------------*/
int get_attrvalue(thisadp,genthead,taglen,contref,notat_specified)
ATTRDESC *thisadp;
ENTITYDESC *genthead;
int *taglen;
BOOLEAN *contref,*notat_specified;
{
   char name[NAMELEN+1],      /* single name read back from the input */
        buffer[ATTSPLEN+1];   /* literal text, then the value to store */
   unsigned
      length,                 /* bytes of 'buffer' that form the value */
      nleng_attrval,          /* normalized length, the return value */
      num_csdata;             /* count reported by process_attr */
   register int inchar;
   int delim,                 /* opening delimiter, LIT or LITA */
       val,                   /* length of one member of a list value */
       (*getone)();           /* reader for the declared value type */
   BOOLEAN more_attr_vals;
   GROUPDESC *groupptr;

   (*taglen) += gettilnosep()+1;
   length = 0;
   nleng_attrval = NORMSEP;
   thisadp->processed = TRUE;
   if ((inchar=our_fgetc(indoc)) != '=')      /* name has already been read */
      ourexit(2,"\nError: Invalid value indicator in attribute specification.\n");
   putchar_outbuf('=');
   (*taglen) += gettilnosep() + 1;

   if ((delim=our_fgetc(indoc))==LITA || delim==LIT)
      putchar_outbuf(delim);
   else
      ourexit(2,"\nError: LIT or LITA not specified in attribute specification\n");

   BLANK(buffer,ATTSPLEN+1);

   /* Pick the reader used to re-read the value.  The types not  */
   /* listed here never call through getone further down.        */
   switch(thisadp->dvcode) {
   case NAME:
   case NAMES:
   case NOTATION:
      getone = get_name;
      break;
   case NUMBER:
   case NUMBERS:
      getone = get_number;
      break;
   case NMTOKEN:
   case NMTOKENS:
   case GROUP:
      getone = get_nmtoken;
      break;
   case NUTOKEN:
   case NUTOKENS:
      getone = get_nutoken;
      break;
   }

   (*taglen) += process_attr(buffer,delim,genthead,thisadp->dvcode,&num_csdata);
   nleng_attrval += num_csdata*NORMSEP;
   (*taglen)++;  /* close delimiter */

   /* Everything except ENUM_CDATA is re-read from the input so  */
   /* that the type-specific reader can validate it.             */
   if (thisadp->dvcode != ENUM_CDATA)  {
      unget_string(buffer);
      gettilnosep();
   }

   if (thisadp->defcode == A_CONREF) {
      if (*notat_specified == TRUE)
         ourexit(2,"\nError: Content reference attribute not allowed with notation attribute.\n");
      *contref = TRUE;
   }

   more_attr_vals = TRUE;

   switch(thisadp->dvcode) {
   case ENUM_CDATA:
      /* taken as is from the buffer filled by process_attr */
      putstr_outbuf(buffer);
      length = strlen(buffer);
      nleng_attrval += length;
      get_close(delim);
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length);
      break;
   case GROUP:
   case NOTATION:
      length = (*getone)(name,our_toupper);  /* GROUP is actually a nmtoken */
      if (thisadp->dvcode == NOTATION) {
         if (*contref == TRUE) {
            sprintf(error_msg,"%s%s%s","\nError: Notation attribute '",name,"' specified after content reference.\n");
            FATAL_ERROR()
         }
         *notat_specified = TRUE;
      }
      nleng_attrval += length+NORMSEP;

      /* value must have been defined as part of the group */
      if ((groupptr=find_group(name,thisadp->groupp)) == NULL) {
         sprintf(error_msg,"%s'%s'.\n","\nError: Unknown attribute group member ",name);
         FATAL_ERROR()
      }
      else
         thisadp->u2.currgrp = groupptr;
      get_close(delim);
      check_fixed(thisadp->defcode,name,thisadp->u2.currgrp->groupname,NAMELEN);
      break;
   case NAME:
   case NMTOKEN:
   case NUTOKEN:
   case NUMBER:
      /* NOTE(review): this call passes a third argument (FALSE)  */
      /* while the other calls through getone, and the direct     */
      /* get_name calls, pass two -- confirm against the readers' */
      /* parameter lists.                                         */
      length = (*getone)(buffer,our_toupper,FALSE);
      nleng_attrval += length+NORMSEP;
      get_close(delim);
#ifdef OLD
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
#else
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(buffer));
#endif
      break;
   case NUMBERS:
   case NAMES:
   case NMTOKENS:
   case NUTOKENS:
      while(more_attr_vals) {   /* process each attribute value */
         /* in list, one at a time       */
         val = (*getone)(buffer+length,our_toupper);
         nleng_attrval += NORMSEP + val;
         length += val;
         if (gettilnosep() != 0)
            length++;            /* leave one position between members */
         more_attr_vals = ((inchar=our_fgetc(indoc)) != delim);
         if (inchar != delim) {
            putchar_outbuf(' ');
            our_ungetc(inchar,indoc);
         }
      }
      putchar_outbuf(inchar);    /* inchar is the closing delimiter here */
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length);
      break;
   case ID:
   case IDREF:
      /* both are a single name, read straight into the buffer */
      length = get_name(buffer,our_toupper);
      nleng_attrval += length+NORMSEP;
      get_close(delim);
#ifdef OLD
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
#else
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(buffer));
#endif
      break;
   case IDREFS:
      while(more_attr_vals) {  /* process each attribute value */
         /* of the list one at a time    */
         val = get_name(buffer+length,our_toupper);
         length += val;
         nleng_attrval += NORMSEP+val;
         if (gettilnosep() != 0)
            length++;            /* leave one position between members */
         more_attr_vals = ((inchar=our_fgetc(indoc)) != delim);
         if (inchar != delim) {
            putchar_outbuf(' ');
            our_ungetc(inchar,indoc);
         }
      }
      putchar_outbuf(inchar);    /* inchar is the closing delimiter here */
      check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length);
      break;
   case ENTITY:
      /* nullfnc instead of our_toupper: presumably entity names  */
      /* are not case folded -- confirm against get_name.         */
      length = get_name(name,nullfnc);
      nleng_attrval += length+NORMSEP;
      if (find_entity(genthead,name,FALSE) == NULL) {
         sprintf(error_msg,"%s'%s'.\n","\nError: Unknown attribute general entity name ",name);
         FATAL_ERROR()
      }
#ifdef OLD
      check_fixed(thisadp->defcode,name,thisadp->u2.currdef,strlen(thisadp->u2.currdef));
#else
      check_fixed(thisadp->defcode,name,thisadp->u2.currdef,strlen(name));
#endif
      get_close(delim);
      /* NOTE(review): the entity name is in 'name', yet the code  */
      /* after the switch stores the first 'length' bytes of       */
      /* 'buffer' (as left by process_attr).  The two differ if    */
      /* the literal starts with separators -- confirm.            */
      break;
   default:
      software_fault();
      break;
   }
   if (nleng_attrval > LITLEN)
      ourexit(2,"\nError: Normalized length of attribute value > LITLEN\n");

   /* GROUP and NOTATION keep their value in u2.currgrp; every    */
   /* other type gets a fresh copy of the buffer in u2.currdef.   */
   if (thisadp->dvcode!=NOTATION && thisadp->dvcode!=GROUP) {
      if (thisadp->u2.currdef != NULL)
         free(thisadp->u2.currdef);
      thisadp->u2.currdef = get_char_mem(length+1);
      buffer[length] = '\0';
      strcpy(thisadp->u2.currdef,buffer);
   }
   return(nleng_attrval);
}

/*------------------------------------------------------*/
/*               G E T _ C D A T A _ M S                */
/* Passes input through as data until the sequence      */
/* "]]" + MARKUP_END or end of file is reached          */
/* (judging by the names used, this is the body of a    */
/* CDATA marked section).                               */
/*                                                      */
/* Each data character is written with put_ctr /        */
/* print_ctr and handed to the application as a         */
/* DATA_STG string.  Carriage returns go through        */
/* save_crs() and check_cr().  The closing sequence is  */
/* not consumed: it is pushed back onto the input for   */
/* the caller.                                          */
/*                                                      */
/* firsttime  only read here; check_cr() works on a     */
/*            local copy, so the caller's flag is not   */
/*            updated                                   */
/*                                                      */
/* Returns FOUND if any data was passed on, otherwise   */
/* NFDHT.  open_cdata_ms is cleared before returning.   */
/*------------------------------------------------------*/
STATUS get_cdata_ms(firsttime)
BOOLEAN *firsttime;
{
   BOOLEAN moredata,cr_found,cdata_ms_ft;
   STATUS retval;
   int inchar;
   unsigned num_cr;
   char *outstr;      /* one-character string handed to the application */

   flush_buf();
   retval = NFDHT;
   cdata_ms_ft = *firsttime;
   outstr = get_char_mem(2);
   moredata = TRUE;
   while(moredata && (inchar=our_fgetc(indoc))!=EOF) {
      cr_found = save_crs(&num_cr,&inchar);
      if (inchar == ']') {
         if ((inchar=our_fgetc(indoc)) == ']') {
            if ((inchar=our_fgetc(indoc)) == MARKUP_END) {
               /* end of the section: leave "]]" + MARKUP_END unread */
               moredata = FALSE;
               our_ungetc(MARKUP_END,indoc);
               unget_string("]]");
            }
            else {
               /* "]]" not followed by MARKUP_END.  Only the first  */
               /* ']' is known to be data; the second one may start */
               /* the real close (input "]]]>" ends the section     */
               /* after one ']' of data).  Emit one ']' and rescan  */
               /* the other two characters.                         */
               retval = FOUND;
               check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE);
               (*put_ctr)(']',ctrfp);
               (*applic)(DATA_STG,"]","");
               our_ungetc(inchar,indoc);
               our_ungetc(']',indoc);
            }
         }
         else {
            /* a single ']' followed by some other character      */
            /* NOTE(review): if our_fgetc returned EOF here, EOF   */
            /* is written out as a data character -- confirm.      */
            retval = FOUND;
            check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE);
            (*print_ctr)(ctrfp,"]%c",inchar);
            (*applic)(DATA_STG,"]","");
            *outstr = inchar;
            (*applic)(DATA_STG,outstr,"");
         }
      }
      else {
         /* ordinary data character */
         retval = FOUND;
         check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE);
         (*put_ctr)(inchar,ctrfp);
         *outstr = inchar;
         (*applic)(DATA_STG,outstr,"");
      }
   }
   open_cdata_ms = FALSE;
   free(outstr);
   return(retval);
}

/*------------------------------------------------------*/
/*                  G E T _ C L O S E                   */
/* Reads one character from 'indoc' and expects it to   */
/* be the delimiter passed in.  A match is copied to    */
/* the output buffer; anything else is reported through */
/* ourexit().                                           */
/*------------------------------------------------------*/
void get_close(delim)
int delim;
{
   int ch = our_fgetc(indoc);

   if (ch == delim) {
      putchar_outbuf(ch);
      return;
   }
   ourexit(2,"\nError: Lit or lita delimeter not found in attribute literal.\n");
}

/*------------------------------------------------------*/
/*               G E T T I L N O S E P                  */
/* Skips separator characters on 'indoc'.  The first    */
/* character that is not a separator is pushed back so  */
/* the next read sees it.  Skipped separators are not   */
/* copied to the output.                                */
/*                                                      */
/* Returns the number of separators skipped.            */
/*------------------------------------------------------*/
gettilnosep()
{
   int skipped = 0;
   int ch;

   for (ch = our_fgetc(indoc); SEPERATOR(ch); ch = our_fgetc(indoc))
      skipped++;

   our_ungetc(ch,indoc);
   return skipped;
}

/*------------------------------------------------------*/
/*              G E T _ C H A R _ M E M                 */
/* Allocates 'number' characters with calloc(), so the  */
/* memory comes back zero-filled.  When calloc() fails  */
/* the condition is reported through ourexit().         */
/*                                                      */
/* Returns the pointer obtained from calloc().          */
/*------------------------------------------------------*/
char *get_char_mem(number)
int number;
{
   char *calloc();
   char *mem = calloc(number,sizeof(char));

   if (mem == NULL) {
      ourexit(2,"\nInsufficient memory in parse3\n");
   }
   return mem;
}

/*------------------------------------------------------*/
/*             G E T _ M S _ C L O S E S                */
/* Consumes consecutive "]]" + MARKUP_END sequences     */
/* from 'indoc'.  Every one found decrements            */
/* num_open_ms, and a character equal to OUR_EE right   */
/* after it is swallowed as well.                       */
/*                                                      */
/* Scanning stops at end of file, when num_open_ms      */
/* reaches zero, or when the input is not such a        */
/* sequence; in the last case the characters read are   */
/* pushed back.                                         */
/*                                                      */
/* After each pass, if the element on top of the stack  */
/* (the root element when the stack is empty) has       */
/* ELEMENT_CONTENT, separators are skipped.             */
/*------------------------------------------------------*/
void get_ms_closes()
{
   BOOLEAN scanning = TRUE;
   STENTRY *top;
   int ch;
   int current;      /* token id of the currently open element */

   while (scanning) {
      ch = our_fgetc(indoc);
      if (ch == EOF)
         break;

      if (ch != ']') {
         /* not a close at all */
         our_ungetc(ch,indoc);
         scanning = FALSE;
      }
      else {
         ch = our_fgetc(indoc);
         if (ch != ']') {
            /* a lone ']' : restore it and what followed */
            our_ungetc(ch,indoc);
            our_ungetc(']',indoc);
            scanning = FALSE;
         }
         else {
            ch = our_fgetc(indoc);
            if (ch != MARKUP_END) {
               /* "]]" without MARKUP_END : restore all three */
               our_ungetc(ch,indoc);
               unget_string("]]");
               scanning = FALSE;
            }
            else {
               /* one complete marked section close */
               if (--num_open_ms == 0)
                  scanning = FALSE;
               ch = our_fgetc(indoc);
               if (ch != OUR_EE)
                  our_ungetc(ch,indoc);
            }
         }
      }

      top = lookstack();
      current = (top == NULL) ? rootid : top->tokenid;

      if (symtable[current].content_type == ELEMENT_CONTENT)
         gettilnosep();   /* separators are allowed between tags */
   }
}
