/*
 * tlex.c -- the lexical analyzer.
 */

#include "..\h\config.h"
#include "general.h"
#include "tproto.h"
#include "trans.h"
#include "token.h"
#include "tlex.h"
#include "tree.h"
#include <ctype.h>

#if MACINTOSH
#if MPW
#include <CursorCtl.h>
#define CURSORINTERVAL 100
#endif					/* MPW */
#endif					/* MACINTOSH */

/*
 * Prototypes.
 */

hidden	int		ctlesc		Params((noargs));
hidden	struct toktab   *findres	Params((noargs));
hidden	struct toktab   *getident	Params((int ac,int *cc));
hidden	struct toktab   *getnum		Params((int ac,int *cc));
hidden	struct toktab   *getopr		Params((int ac,int *cc));
hidden	struct toktab   *getstring	Params((int ac,int *cc));
hidden	int		hexesc		Params((noargs));
hidden	int		nextchar	Params((noargs));
hidden	int		octesc		Params((int ac));
hidden	int		setfilenm	Params((int c));
hidden	int		setlineno	Params((noargs));

#define isletter(s)	(isupper(c) | islower(c))

#if EBCDIC
extern char ToEBCDIC[256], FromEBCDIC[256];
#endif					/* EBCDIC */

#if !EBCDIC
#define tonum(c)        (isdigit(c) ? (c - '0') : ((c & 037) + 9))

/*
 * esctab - translates single-character escapes in string literals.
 */

static char esctab[] = {
   000,   001,   002,   003,   004,   005,   006,   007,   /* NUL-BEL */
   010,   011,   012,   013,   014,   015,   016,   017,   /* BS -SI */
   020,   021,   022,   023,   024,   025,   026,   027,   /* DLE-ETB */
   030,   031,   032,   033,   034,   035,   036,   037,   /* CAN-US */
   ' ',   '!',   '"',   '#',   '$',   '%',   '&',   '\'',  /* !"#$%&' */
   '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',   /* ()*+,-./ */
   000,   001,   002,   003,   004,   005,   006,   007,   /* 01234567 */
   010,   011,   ':',   ';',   '<',   '=',   '>',   '?',   /* 89:;<=>? */
   '@',   'A',   '\b',  'C',   0177,  033,   014,   'G',   /* @ABCDEFG */
   'H',   'I',   'J',   'K',   '\n',  'M',  '\n',   'O',   /* HIJKLMNO */
   'P',   'Q',   '\r',  'S',   '\t',  'U',   013,   'W',   /* PQRSTUVW */
   'X',   'Y',   'Z',   '[',   '\\',  ']',   '^',   '_',   /* XYZ[\]^_ */
   '`',   'a',   '\b',  'c',   0177,  033,   014,   'g',   /* `abcdefg */
   'h',   'i',   'j',   'k',   '\n',  'm',   '\n',  'o',   /* hijklmno */
   'p',   'q',   '\r',  's',   '\t',  'u',   013,   'w',   /* pqrstuvw */
   'x',   'y',   'z',   '{',   '|',   '}',   '~',   0177,  /* xyz{|}~ */
   0200,  0201,  0202,  0203,  0204,  0205,  0206,  0207,
   0210,  0211,  0212,  0213,  0214,  0215,  0216,  0217,
   0220,  0221,  0222,  0223,  0224,  0225,  0226,  0227,
   0230,  0231,  0232,  0233,  0234,  0235,  0236,  0237,
   0240,  0241,  0242,  0243,  0244,  0245,  0246,  0247,
   0250,  0251,  0252,  0253,  0254,  0255,  0256,  0257,
   0260,  0261,  0262,  0263,  0264,  0265,  0266,  0267,
   0270,  0271,  0272,  0273,  0274,  0275,  0276,  0277,
   0300,  0301,  0302,  0303,  0304,  0305,  0306,  0307,
   0310,  0311,  0312,  0313,  0314,  0315,  0316,  0317,
   0320,  0321,  0322,  0323,  0324,  0325,  0326,  0327,
   0330,  0331,  0332,  0333,  0334,  0335,  0336,  0337,
   0340,  0341,  0342,  0343,  0344,  0345,  0346,  0347,
   0350,  0351,  0352,  0353,  0354,  0355,  0356,  0357,
   0360,  0361,  0362,  0363,  0364,  0365,  0366,  0367,
   0370,  0371,  0372,  0373,  0374,  0375,  0376,  0377,
  };
#else                                   /* !EBCDIC */
/*
 *  This is the EBCDIC table for handling escapes.
 */
static char esctab[] = {
   0x00,  0x01,  0x02,  0x03,  0x04,  0x05,  0x06,  0x07,
   0x08,  0x09,  0x0a,  0x0b,  0x0c,  0x0d,  0x0e,  0x0f,
   0x10,  0x11,  0x12,  0x13,  0x14,  0x15,  0x16,  0x17,
   0x18,  0x19,  0x1a,  0x1b,  0x1c,  0x1d,  0x1e,  0x1f,
   0x20,  0x21,  0x22,  0x23,  0x24,  0x25,  0x26,  0x27,
   0x28,  0x29,  0x2a,  0x2b,  0x2c,  0x2d,  0x2e,  0x2f,
   0x30,  0x31,  0x32,  0x33,  0x34,  0x35,  0x36,  0x37,
   0x38,  0x39,  0x3a,  0x3b,  0x3c,  0x3d,  0x3e,  0x3f,
   ' ',   0x41,  0x42,  0x43,  0x44,  0x45,  0x46,  0x47,
   0x48,  0x49,  0x4a,  0x4b,  0x4c,  0x4d,  0x4e,  0x4f,
   0x50,  0x51,  0x52,  0x53,  0x54,  0x55,  0x56,  0x57,
   0x58,  0x59,  0x5a,  0x5b,  0x5c,  0x5d,  0x5e,  0x5f,
   0x60,  0x61,  0x62,  0x63,  0x64,  0x65,  0x66,  0x67,
   0x68,  0x69,  0x6a,  0x6b,  0x6c,  0x6d,  0x6e,  0x6f,
   0x70,  0x71,  0x72,  0x73,  0x74,  0x75,  0x76,  0x77,
   0x78,  0x79,  0x7a,  0x7b,  0x7c,  0x7d,  0x7e,  0x7f,
   0x80,  'a',   0x16,  'c',   0x07,  0x27,  0x0c,  'g',
   'h',   'i',   0x8a,  0x8b,  0x8c,  0x8d,  0x8e,  0x8f,

#if EBCDIC == 2
   0x90,  'j',   'k',   0x15,  'm',   0x15,  'o',   'p',
#else					/* EBCDIC == 2 */
   0x90,  'j',   'k',   0x25,  'm',   0x15,  'o',   'p',
#endif					/* EBCDIC == 2 */

   'q',   0x0d,  0x9a,  0x9b,  0x9c,  0x9d,  0x9e,  0x9f,
   0xa0,  0xa1,  's',   0x05,  'u',   0x0b,  'w',   'x',
   'y',   'z',   0xaa,  0xab,  0xac,  0xad,  0xae,  0xaf,
   0xb0,  0xb1,  0xb2,  0xb3,  0xb4,  0xb5,  0xb6,  0xb7,
   0xb8,  0xb9,  0xba,  0xbb,  0xbc,  0xbd,  0xbe,  0xbf,
   0xc0,  'A',   0x16,  'C',   0x07,  0x27,  0x0c,  'G',
   'H',   'I',   0xca,  0xcb,  0xcc,  0xcd,  0xce,  0xcf,

#if EBCDIC == 2
   0xd0,  'J',   'K',   0x15,  'M',   0x15,  'O',   'P',
#else					/* EBCDIC == 2 */
   0xd0,  'J',   'K',   0x25,  'M',   0x15,  'O',   'P',
#endif					/* EBCDIC == 2 */

   'Q',   0x0d,  0xda,  0xdb,  0xdc,  0xdd,  0xde,  0xdf,
   0xe0,  0xe1,  'S',   0x05,  'U',   0x0b,  'W',   'X',
   'Y',   'Z',   0xea,  0xeb,  0xec,  0xed,  0xee,  0xef,
   0,   1,   2,   3,     4,     5,     6,     7,
   8,   9,   0xfa,   0xfb,  0xfc,  0xfd,  0xfe,  0xff,
   };
#endif					/* !EBCDIC */

struct node tok_loc =
   {0, NULL, 0, 0};	/* "model" node containing location of current token */

/*
 * yylex - find the next token in the input stream, and return its token
 *  type and value to the parser.
 *
 * Variables of interest:
 *
 *  cc - character following last token.
 *  comflag - set if in a comment.
 *  nlflag - set if a newline was between the last token and the current token
 *  lastend - set if the last token was an Ender.
 *  lastval - when a semicolon is inserted and returned, lastval gets the
 *   token value that would have been returned if the semicolon hadn't
 *   been inserted.
 */

static struct toktab *lasttok = NULL;
static int lastend = 0;
static int eofflag = 0;
static int cc = '\n';

int yylex()
   {
   register struct toktab *t;
   register int c;
   int nlflag;
   int comflag;
   static nodeptr lastval;
   static struct node semi_loc;

   if (lasttok != NULL) {
      /*
       * A semicolon was inserted and returned on the last call to yylex,
       *  instead of going to the input, return lasttok and set the
       *  appropriate variables.
       */

      yylval = lastval;
      tok_loc = *lastval;
      t = lasttok;
      goto ret;
      }
   nlflag = 0;
   comflag = 0;
loop:
   c = cc;
   /*
    * Remember where a semicolon will go if we insert one.
    */
   semi_loc.n_file = tok_loc.n_file;
   semi_loc.n_line = in_line;
   semi_loc.n_col = incol;
   /*
    * Skip whitespace and comments and process #line directives.
    */
   while (c == Comment || isspace(c)) {
      if (c == '\n') {
         nlflag++;
         c = NextChar;
	 if (c == Comment) {
            /*
	     * Check for #line directive at start of line.
             */
            if (('l' == (c = NextChar)) &&
                ('i' == (c = NextChar)) &&
                ('n' == (c = NextChar)) &&
                ('e' == (c = NextChar))) {
               c = setlineno();
	       while ((c == ' ') || (c == '\t'))
		  c = NextChar;
               if (c != EOF && c != '\n')
                  c = setfilenm(c);
	       }
	    while (c != EOF && c != '\n')
               c = NextChar;
	    }
         }
      else {
	 if (c == Comment) {
	    while (c != EOF && c != '\n')
               c = NextChar;
	    }
         else {
            c = NextChar;
            }
         }
      }
   /*
    * A token is the next thing in the input.  Set token location to
    *  the current line and column.
    */
   tok_loc.n_line = in_line;
   tok_loc.n_col = incol;

   if (c == EOF) {
      /*
       * End of file has been reached.	Set eofflag, return T_Eof, and
       *  set cc to EOF so that any subsequent scans also return T_Eof.
       */
      if (eofflag++) {
	 eofflag = 0;
	 cc = '\n';
	 yylval = NULL;
	 return 0;
	 }
      cc = EOF;
      t = T_Eof;
      yylval = NULL;
      goto ret;
      }

   /*
    * Look at current input character to determine what class of token
    *  is next and take the appropriate action.  Note that the various
    *  token gathering routines write a value into cc.
    */
   if (isalpha(c) || (c == '_')) {   /* gather ident or reserved word */
      if ((t = getident(c, &cc)) == NULL)
	 goto loop;
      }
   else if (isdigit(c)) {		/* gather numeric literal */
      if ((t = getnum(c, &cc)) == NULL)
	 goto loop;
      }
   else if (c == '"' || c == '\'') {    /* gather string or cset literal */
      if ((t = getstring(c, &cc)) == NULL)
	 goto loop;
      }
   else {			/* gather longest legal operator */
      if ((t = getopr(c, &cc)) == NULL)
	 goto loop;
      yylval = OpNode(t->t_type);
      }
   if (nlflag && lastend && (t->t_flags & Beginner)) {
      /*
       * A newline was encountered between the current token and the last,
       *  the last token was an Ender, and the current token is a Beginner.
       *  Return a semicolon and save the current token in lastval.
       */
      lastval = yylval;
      lasttok = t;
      tok_loc = semi_loc;
      yylval = OpNode(SEMICOL);
      return SEMICOL;
      }
ret:
   /*
    * Clear lasttok, set lastend if the token being returned is an
    *  Ender, and return the token.
    */
   lasttok = 0;
   lastend = t->t_flags & Ender;
   return (t->t_type);
   }

#ifdef MultipleRuns
/*
 * yylexinit - initialize variables for multiple runs
 */
novalue yylexinit()
   {
   lasttok = NULL;
   lastend = 0;
   eofflag = 0;
   cc = '\n';
   }

#endif					/* MultipleRuns */
/*
 * getident - gather an identifier beginning with ac.  The character
 *  following identifier goes in cc.
 */

static struct toktab *getident(ac, cc)
int ac;
int *cc;
   {
   register int c;
   register char *p;
   register struct toktab *t;

   c = ac;
   p = strf;
   /*
    * Copy characters into string space until a non-alphanumeric character
    *  is found.
    */
   do {
      if (p >= stre)
	 tsyserr("out of string space");
      *p++ = c;
      c = NextChar;
      } while (isalnum(c) || (c == '_'));
   if (p >= stre)
      tsyserr("out of string space");
   *p++ = 0;
   *cc = c;
   /*
    * If the identifier is a reserved word, make a ResNode for it and return
    *  the token value.  Otherwise, install it with putid, make an
    *  IdNode for it, and return.
    */
   if ((t = findres()) != NULL) {
      yylval = ResNode(t->t_type);
      return t;
      }
   else {
      yylval = IdNode(putid((int)(p-strf)));
      return (struct toktab *)T_Ident;
      }
   }

/*
 * findres - if the string just copied into the string space by getident
 *  is a reserved word, return a pointer to its entry in the token table.
 *  Return NULL if the string isn't a reserved word.
 */

static struct toktab *findres()
   {
   register struct toktab *t;
   register char c, *p;

   p = strf;
   c = *p;
   if (!islower(c))
      return NULL;
   /*
    * Point t at first reserved word that starts with c (if any).
    */
   if ((t = restab[c - 'a']) == NULL)
      return NULL;
   /*
    * Search through reserved words, stopping when a match is found
    *  or when the current reserved word doesn't start with c.
    */
   while (t->t_word[0] == c) {
      if (strcmp(t->t_word, p) == 0)
	 return t;
      t++;
      }
   return NULL;
   }

/*
 * getnum - gather a numeric literal starting with ac and put the
 *  character following the literal into *cc.
 */

static struct toktab *getnum(ac, cc)
int ac;
int *cc;
   {
   register int c, r, state;
   char *p;
   int realflag;

   c = ac;
   r = tonum(c);
   p = strf;
   state = 0;
   realflag = 0;
   for (;;) {
      if (p >= stre)
	 tsyserr("out of string space");
      *p++ = c;
      c = NextChar;
      switch (state) {
	 case 0:		/* integer part */
	    if (isdigit(c))	    { r = r * 10 + tonum(c); continue; }
	    if (c == '.')           { state = 1; realflag++; continue; }
	    if (c == 'e' || c == 'E')  { state = 2; realflag++; continue; }
	    if (c == 'r' || c == 'R')  {
	       state = 5;
	       if (r < 2 || r > 36)
		  tfatal("invalid radix for integer literal", (char *)NULL);
	       continue;
	       }
	    break;
	 case 1:		/* fractional part */
	    if (isdigit(c))   continue;
	    if (c == 'e' || c == 'E')   { state = 2; continue; }
	    break;
	 case 2:		/* optional exponent sign */
	    if (c == '+' || c == '-') { state = 3; continue; }
	 case 3:		/* first digit after e, e+, or e- */
	    if (isdigit(c)) { state = 4; continue; }
	    tfatal("invalid real literal", (char *)NULL);
	    break;
	 case 4:		/* remaining digits after e */
	    if (isdigit(c))   continue;
	    break;
	 case 5:		/* first digit after r */
	    if ((isdigit(c) || isletter(c)) && tonum(c) < r)
	       { state = 6; continue; }
	    tfatal("invalid integer literal", (char *)NULL);
	    break;
	 case 6:		/* remaining digits after r */
	    if (isdigit(c) || isletter(c)) {
	       if (tonum(c) >= r) {	/* illegal digit for radix r */
		  tfatal("invalid digit in integer literal", (char *)NULL);
		  r = tonum('z');       /* prevent more messages */
		  }
	       continue;
	       }
	    break;
	 }
      break;
      }
   if (p >= stre)
      tsyserr("out of string space");
   *p++ = 0;
   *cc = c;
   if (realflag) {
      yylval = RealNode(putid((int)(p-strf)));
      return T_Real;
      }
   yylval = IntNode(putid((int)(p-strf)));
   return T_Int;
   }

/*
 * getstring - gather a string literal starting with ac and place the
 *  character following the literal in *cc.
 */

static struct toktab *getstring(ac, cc)
int ac;
int *cc;
   {
   register int c, sc;
   register char *p;
   char *lc;
   int len;

   sc = c = ac;
   p = strf;
   lc = 0;
   while ((c = NextChar) != sc && c != '\n' && c != EOF) {
   contin:
      if (c == '_')
	 lc = p;
      else if (!isspace(c))
	 lc = 0;
      if (c == Escape) {
	 c = NextChar;

#ifdef VarTran
	 *p++ = Escape;
#else					/* VarTran */
	 if (isoctal(c))
	    c = octesc(c);
	 else if (c == 'x')
	    c = hexesc();
	 else if (c == '^')
	    c = ctlesc();
	 else
	    c = esctab[c];
#endif					/* VarTran */

	 if (c == EOF)
	    goto noquote;
	 }
      if (p >= stre)
	 tsyserr("out of string space");
      *p++ = c;
      }
   if (p >= stre)
      tsyserr("out of string space");
   *p++ = 0;
   if (c == sc)
      *cc = ' ';
   else {
      if (c == '\n' && lc) {
	 p = lc;
	 while ((c = NextChar) != EOF && isspace(c)) ;
	 if (c != EOF)
	    goto contin;
	 }
noquote:
      tfatal("unclosed quote", (char *)NULL);
      *cc = c;
      }
   if (ac == '"') {     /* a string literal */
      len = p - strf;
      yylval = StrNode(putid((int)len), len);
      return T_String;
      }
   else {		/* a cset literal */
      len = p - strf;
      yylval = CsetNode(putid((int)len), len);
      return T_Cset;
      }
   }

#ifndef VarTran

/*
 * ctlesc - translate a control escape -- backslash followed by
 *  caret and one character.
 */

static int ctlesc()
   {
   register int c;

   c = NextChar;
   if (c == EOF)
      return EOF;

#if !EBCDIC
   return (c & 037);
#else					/* !EBCDIC */
   return ToEBCDIC[FromEBCDIC[c] & 037];
                        /* ctrl-x in EBCDIC is the EBCDIC equivalent */
                        /* to ASCII ctrl-x                           */
#endif					/* !EBCDIC */

   }

/*
 * octesc - translate an octal escape -- backslash followed by
 *  one, two, or three octal digits.
 */

static int octesc(ac)
int ac;
   {
   register int c, nc, i;

   c = 0;
   nc = ac;
   i = 1;
   do {
      c = (c << 3) | (nc - '0');
      nc = NextChar;
      if (nc == EOF)
	 return EOF;
      } while (isoctal(nc) && i++ < 3);
   PushChar(nc);

#if EBCDIC != 2
   return (c & 0377);
#else					/* EBCDIC != 2 */
   return ToEBCDIC[c & 0377];
#endif					/* EBCDIC != 2 */
   }

/*
 * hexesc - translate a hexadecimal escape -- backslash-x
 *  followed by one or two hexadecimal digits.
 */

static int hexesc()
   {
   register int c, nc, i;

   c = 0;
   i = 0;
   while (i++ < 2) {
      nc = NextChar;
      if (nc == EOF)
	 return EOF;
      if (nc >= 'a' && nc <= 'f')
	 nc -= 'a' - 10;
      else if (nc >= 'A' && nc <= 'F')
	 nc -= 'A' - 10;
      else if (isdigit(nc))
	 nc -= '0';
      else {
	 PushChar(nc);
	 break;
	 }
      c = (c << 4) | nc;
      }

#if EBCDIC != 2
   return c;
#else					/* EBCDIC != 2 */
   return ToEBCDIC[c];
#endif					/* EBCDIC != 2 */

   }

#endif					/* VarTran */

/*
 * getopr - find the longest legal operator and return a pointer
 *  to its entry in the token table.
 */

static struct toktab *getopr(ac, cc)
int ac;
int *cc;
   {
   register struct optab *state;
   register char c, i;

   state = state0;
   c = ac;
   for (;;) {
      while ((i = state->o_input) && c != i)
	 state++;
      switch (state->o_action) {
	 case A_Goto:
	    state = (struct optab *) state->o_val;
	    c = NextChar;
	    continue;
	 case A_Error:
	    tfatal("invalid character", (char *)NULL);
	    *cc = ' ';
	    return NULL;
	 case A_Return:
	    *cc = c;
	    return (struct toktab *)(state->o_val);
	 case A_Immret:
	    *cc = ' ';
	    return (struct toktab *)(state->o_val);
	 }
      }
   }

/*
 * setlineno - set line number from #line comment, return following char.
 */

static int setlineno()
   {
   register int c;

   while ((c = NextChar) == ' ' || c == '\t')
      ;
   if (c < '0' || c > '9') {
      tfatal("no line number in #line directive", "");
      while (c != EOF && c != '\n')
	 c = NextChar;
      return c;
      }
   in_line = 0;
   while (c >= '0' && c <= '9') {
      in_line = in_line * 10 + (c - '0');
      c = NextChar;
      }
   return c;
   }

/*
 * setfilenm -	set file name from #line comment, return following char.
 *
 * Assigning to comfile here does not provide the fine-grained
 * control over filenames required by a real macro processor.
 * setloc() in tcode.c ought to be restored to its earlier form and
 * the initialization of filenames fixed.
 */

static int setfilenm(c)
register int c;
   {
   extern char *comfile;
   register char *p;

   while (c == ' ' || c == '\t')
      c = NextChar;
   if (c != '"') {
      tfatal("'\"' missing from file name in #line directive", "");
      while (c != EOF && c != '\n')
	 c = NextChar;
      return c;
      }
   p = strf;
   while ((c = NextChar) != '"' && c != EOF && c != '\n') {
      if (p >= stre)
	 tsyserr("out of string space");
      *p++ = c;
      }
   *p++ = '\0';
   if (c == '"') {
      tok_loc.n_file = putid((int)(p-strf));
      return NextChar;
      }
   else {
      tfatal("'\"' missing from file name in #line directive", "");
      return c;
      }
   }

/*
 * nextchar - return the next character in the input.
 */

static int nextchar()
   {
   register int c;

#if MACINTOSH
#if MPW
   {
   static short cursorcount = CURSORINTERVAL;
   if (--cursorcount == 0) {
      RotateCursor(0);
      cursorcount = CURSORINTERVAL;
      }
   }
#endif					/* MPW */
#endif					/* MACINTOSH */

   if (c = peekc) {
      peekc = 0;
      return c;
      }
   c = getc(srcfile);
   switch (c) {
      case EOF:
	 if (incol) {
	    c = '\n';
	    in_line++;
	    incol = 0;
	    peekc = EOF;
	    break;
	    }
	 else {
	    in_line = 0;
	    incol = 0;
	    break;
	    }
      case '\n':
	 in_line++;
	 incol = 0;
	 break;
      case '\t':
	 incol = (incol | 7) + 1;
	 break;
      case '\b':
	 if (incol)
	    incol--;
	 break;
      default:
	 incol++;
      }
   return c;
   }
