/*
 EPSHeader

   File: asmtag.c
   Author: J. Kercheval
   Created: Sun, 07/14/1991  17:25:26
*/
/*
 EPSRevision History

   J. Kercheval  Sun, 07/14/1991  20:25:59  creation
   J. Kercheval  Mon, 07/15/1991  22:47:30  finish finite state machine parser
   J. Kercheval  Wed, 07/17/1991  21:35:43  add IsMember() and get_token()
   J. Kercheval  Thu, 07/18/1991  19:57:34  add flags checking
   J. Kercheval  Sun, 07/21/1991  15:58:56  add comment block support
   J. Kercheval  Sat, 07/27/1991  21:16:53  remove public post process support
   J. Kercheval  Sat, 07/27/1991  22:50:49  performance considerations (+10%)
   J. Kercheval  Sat, 08/10/1991  18:14:46  Speed up IsMember()
   J. Kercheval  Fri, 09/13/1991  01:17:05  add when_loading() to remap def_srch_case_map[]
   J. Kercheval  Thu, 10/03/1991  12:27:37  fix logic outputting local labels
   J. Kercheval  Sat, 10/05/1991  14:06:33  add ASMTagWant defines
*/

/*
 * This file implements tagging for .ASM and .INC files which contain 80x86
 * assembler in the MASM/TASM syntax.  This file defines no new commands and
 * is intended to work with the tags package included with V5.0 of Epsilon.
 * There is no problem using modified tags packages providing calls are made
 * to tags_suffix_???() routines in the same way Epsilon does this and that
 * an output routine add_tag() is used.  All that should be required is to
 * compile and load this file and this module will be used transparently to
 * you.
 *
 * This module implements tagging for UNION, STRUC, MACRO, PROC, LABEL
 * keywords as well as for implicit labels (label:) and for data definitions
 * (i.e. equ, =, dq, dw, db, etc.).  The performance cost on a per tag
 * basis is negligible, but since more tagging is done, you should expect a
 * practical 10%-20% performance hit on a per file basis.  This tagger is not
 * intended to do all of your work for you but is designed to be used in
 * conjunction with the tags generator I have developed and is now available.
 * This file implements the same semantic parser as is found in that
 * executable.  Use the executable in your make file for very fast and
 * updated tags.  If you have problems finding it, contact me and I can point
 * the way...
 *
 * There is defined at the end of this module a when_loading() function which
 * alters the default search case map to allow *correct* (or at least
 * consistent) sorting with sort routines external to Epsilon.  In particular,
 * this mapping must be done to produce the same sort order as any UNIX, VMS
 * or HP style sort, or as the tags generator this module is supposed to
 * coexist with.  You should see no difference in the location of sorted
 * buffers except for lines starting with ^, [, \, ] and _.
 *
 * This code is dedicated to the public domain with the caveat that Lugaru is
 * welcome to use this within their distribution source code which is
 * supplied with Epsilon.
 *
 * Good Tagging,
 *
 *      jbk@wrq.com
 *
 *      John Kercheval
 *      127 NW Bowdoin Pl #105
 *      Seattle, WA  98107-4960
 *      August 10, 1991
 */

#include <eel.h>

/* Boolean emulation: BOOLEAN is a plain int with TRUE and FALSE as 1 and 0.
 * The three definitions are skipped when BOOLEAN is already defined.
 */
#ifndef BOOLEAN
#define BOOLEAN int
#define TRUE 1
#define FALSE 0
#endif

/* Compile time switches selecting which kinds of symbols are tagged.  Set a
 * switch to TRUE to emit tags of that kind and to FALSE to suppress them.
 *
 * Proc, Macro, Label, Struc and Union are consulted by ASMSymbolWanted()
 * for the keyword based tags.  ASMTagWantLabel is additionally tested by
 * ASMTags() for implicit "name:" labels, and ASMTagWantDefine is tested by
 * ASMTags() for every other defining token (equ, =, db, dw, ...).
 */
#define ASMTagWantProc TRUE
#define ASMTagWantMacro TRUE
#define ASMTagWantLabel TRUE
#define ASMTagWantStruc TRUE
#define ASMTagWantUnion TRUE
#define ASMTagWantDefine TRUE

/*
 * The finite state machine allows the following interesting paths
 *
 *    1 - Discard, Parse1, Symbol1
 *    2 - Discard, Parse1, Parse2, Symbol2
 *    3 - Discard, Parse1, Parse2, Define
 *
 * all the important cases follow one of these paths according to MASM/TASM
 * syntax.  The exit state is for finish up routine calls and some paths not
 * covered here are simple error paths and probably result from syntax errors
 *
 *  enum state { Discard, Parse1, Parse2, Symbol1, Symbol2, Define, Exit };
 */
/*
 * Emulate an enumerated type for the state machine: each state is a small
 * integer constant and State is simply an int holding one of them.
 *
 *    Discard - fetch the next input line (or leave when there is none)
 *    Parse1  - look at the first significant token of the line
 *    Parse2  - look at the second significant token of the line
 *    Symbol1 - first token was a tagging keyword, the tag follows it
 *    Symbol2 - second token was a tagging keyword, the tag precedes it
 *    Define  - second token was a defining token, the tag precedes it
 *    Exit    - restore the editor state and return
 */

#define Discard 0
#define Parse1  1
#define Parse2  2
#define Symbol1 3
#define Symbol2 4
#define Define  5
#define Exit    6

typedef int State;

/* the character which starts a comment running to the end of the line */
#define COMMENT_CHAR ';'

/* the size in bytes of one entry (terminator included) in the symbol lists */
#define SYMBOL_SIZE 15


/*----------------------------------------------------------------------------
 *
 * The symbol lists represent all the symbols we are interested in either
 * obtaining or ignoring.  The first element of each of these symbol lists is
 * a string containing all the first characters within the symbol list.  This
 * allows faster rejection for IsMember() which is called often.
 *
 ---------------------------------------------------------------------------*/

/* Symbols which are not significant for this parser.  ASM_get_token() skips
 * any token found in this list and goes on to the next one.  Entry 0 is the
 * set of first characters used by ASMIsMember() for quick rejection and the
 * empty entry at the end terminates the list. */
char ASM_NOP_Sym[][SYMBOL_SIZE] =
{
    "cpbfnwo",                  /* list of starting characters of symbols
                                 * below */
    "c",                        /* C language declaration */
    "pascal",                   /* PASCAL language declaration */
    "basic",                    /* BASIC language declaration */
    "fortran",                  /* FORTRAN language declaration */
    "prolog",                   /* PROLOG language declaration */
    "nolanguage",               /* generic language declaration */
    "windows",                  /* WINDOWS exit and entry modifier */
    "oddnear",                  /* overlay modifier */
    "oddfar",                   /* overlay modifier */
    "normal",                   /* normal procedure entry/exit code */
    "\0"
};

/* Symbols which begin a comment block.  ASMTags() checks the first token of
 * a line against this list and, on a match, skips input up to the closing
 * delimiter character.  Entry 0 is the set of first characters used by
 * ASMIsMember() for quick rejection and the empty entry at the end
 * terminates the list. */
char ASM_comment_block[][SYMBOL_SIZE] =
{
    "c",                        /* list of starting characters of symbols
                                 * below */
    "comment",                  /* begin comment block, next character is
                                 * delimiter */
    "\0"
};


/* create the function for determining if a character is a delimiter; the
 * character is used directly as an index into the lookup table */
#define IsDelim(c) ( _ASM_delim_boolean_table[c] )

/* the indexed table for delimiter character lookup, filled in by
 * ASMParserInit() */
BOOLEAN _ASM_delim_boolean_table[256];

/* valid delimiters for this syntax (ASMParserInit() also marks the
 * terminating NUL as a delimiter) */
char ASM_delim[] = " \t\n;:=.,\"()<>[]*-+/";


/* create the function for determining if a character is a whitespace; the
 * character is used directly as an index into the lookup table */
#define IsWhite(c) ( _ASM_white_boolean_table[c] )

/* the indexed table for white space character lookup, filled in by
 * ASMParserInit() */
BOOLEAN _ASM_white_boolean_table[256];

/* whitespace characters */
char ASM_white[] = " \t\v\f";


/* symbols which both are delimiters and a special token, these are
    special tokens only when found at the beginning of a string of
    1 or more delimiters; ASM_get_token() returns each of them as a one
    character token */
char ASM_delim_Sym[] = "=:";

/* Symbols which fit into the Define state and represent a tagged symbol:
 * when one of these is the second token on a line, the first token is the
 * name being defined.  Entry 0 is the set of first characters used by
 * ASMIsMember() for quick rejection and the empty entry at the end
 * terminates the list. */
/* state Define depends on the token ":" being at index 1 in this list; that
 * index is how ASMTags() tells an implicit label from a data definition */
char ASM_def[][SYMBOL_SIZE] =
{
    ":e=cd",                    /* list of starting characters of symbols
                                 * below */
    ":",                        /* local labels */
    "equ",                      /* equivalence */
    "=",                        /* equivalence */
    "catstr",                   /* concatenated and named strings */
    "db",                       /* named byte data definition */
    "dw",                       /* named word data definition */
    "dd",                       /* named double word data definition */
    "dp",                       /* named 6 byte far pointer data area
                                 * definition */
    "df",                       /* named 6 byte far pointer definition */
    "dq",                       /* named quad word data definition */
    "dt",                       /* named 10 byte data area */
    "\0"
};

/* Symbols which fit into the Symbol states and represent a tagged symbol.
 * The position of each keyword in this list is the index which
 * ASMSymbolWanted() maps onto the ASMTagWant switches, so the order here
 * must stay in step with that function.  Entry 0 is the set of first
 * characters used by ASMIsMember() for quick rejection and the empty entry
 * at the end terminates the list. */
char ASM_sym[][SYMBOL_SIZE] =
{
    "pmlsu",                    /* list of starting character of symbols
                                 * below */
    "proc",                     /* procedures */
    "macro",                    /* macros */
    "label",                    /* local labels */
    "struc",                    /* structures */
    "union",                    /* unions */
    "\0"
};


/*----------------------------------------------------------------------------
 *
 * ASMParserInit() builds the two character lookup tables used by the
 * IsDelim() and IsWhite() macros.  Each table has one entry per character
 * value; an entry is TRUE when that character appears in the matching
 * string (ASM_delim or ASM_white) and FALSE otherwise.  The NUL character
 * is always treated as a delimiter so that scanning stops at end of string.
 *
 ---------------------------------------------------------------------------*/

ASMParserInit()
{
    int ch;
    char *member;

    /* start with every character being neither white space nor delimiter */
    ch = 0;
    while (ch < 256) {
        _ASM_white_boolean_table[ch] = FALSE;
        _ASM_delim_boolean_table[ch] = FALSE;
        ch++;
    }

    /* flag each member of the delimiter set */
    member = ASM_delim;
    while (*member) {
        _ASM_delim_boolean_table[*member] = TRUE;
        member++;
    }

    /* the string terminator ends a token as well */
    _ASM_delim_boolean_table['\0'] = TRUE;

    /* flag each member of the white space set */
    member = ASM_white;
    while (*member) {
        _ASM_white_boolean_table[*member] = TRUE;
        member++;
    }
}


/*----------------------------------------------------------------------------
 *
 * strchr() behaves like the standard string library function of the same
 * name: it returns a pointer to the first occurrence of c in the string s,
 * or NULL when c does not occur.  The terminating NUL counts as part of the
 * string, so searching for '\0' returns a pointer to the terminator.
 *
 ---------------------------------------------------------------------------*/

char *strchr(s, c)
    char *s;
    char c;
{
    char *scan;

    /* walk forward until the wanted character turns up; running into the
     * terminator first means the character is absent */
    for (scan = s; *scan != c; scan++) {
        if (!*scan)
            return NULL;
    }

    return scan;
}


/*----------------------------------------------------------------------------
 *
 * ASMSymbolWanted() reports whether tags should be produced for the keyword
 * found at the given index of the ASM_sym token list.  Each index is tied
 * to one of the ASMTagWant defines:
 *
 *          Index  Symbol   Flag
 *          -----  -------  ---------------
 *          1      "proc"   ASMTagWantProc
 *          2      "macro"  ASMTagWantMacro
 *          3      "label"  ASMTagWantLabel
 *          4      "struc"  ASMTagWantStruc
 *          5      "union"  ASMTagWantUnion
 *
 * Any other index yields FALSE.
 *
 ---------------------------------------------------------------------------*/

BOOLEAN ASMSymbolWanted(index)
    int index;
{
    BOOLEAN wanted = FALSE;

    /* pick up the flag belonging to the keyword, unknown indexes stay
     * unwanted */
    if (index == 1)
        wanted = ASMTagWantProc;
    else if (index == 2)
        wanted = ASMTagWantMacro;
    else if (index == 3)
        wanted = ASMTagWantLabel;
    else if (index == 4)
        wanted = ASMTagWantStruc;
    else if (index == 5)
        wanted = ASMTagWantUnion;

    return wanted;
}


/*----------------------------------------------------------------------------
 *
 * ASMIsMember() checks the token passed for membership in token_list.
 *
 * token_list[0] is not a symbol but the set of first characters of all the
 * symbols which follow; the symbols themselves start at index 1 and the
 * list ends at the first empty entry.
 *
 * Returns TRUE when the token is a member, in which case *index is the
 * position of the matching symbol in token_list.  Returns FALSE otherwise;
 * *index is then either untouched (quick rejection on the first character)
 * or left at the terminating entry (full scan without a match) and should
 * not be relied on.
 *
 * The comparison is made with case_fold temporarily set to 1, which per the
 * original author's note makes strfcmp() compare without regard to case.
 * The caller's case_fold value is put back on every path out of this
 * function.
 *
 ---------------------------------------------------------------------------*/

BOOLEAN ASMIsMember(token_list, token, index)
    char token_list[][SYMBOL_SIZE];
    char *token;
    int *index;
{
    int old_case_fold;
    BOOLEAN found = FALSE;

    /* look for dirty rejection; this is done before case_fold is touched so
     * that the early return cannot leave the caller's setting changed */
    if (!strchr(token_list[0], tolower(token[0])))
        return FALSE;

    /* use non case sensitive string compare for the scan */
    old_case_fold = case_fold;
    case_fold = 1;

    /* march through array until membership is determined */
    for (*index = 1; *token_list[*index]; (*index)++) {
        if (!strfcmp(token, token_list[*index])) {
            found = TRUE;
            break;
        }
    }

    /* put the caller's case folding back, match or no match */
    case_fold = old_case_fold;
    return found;
}


/*----------------------------------------------------------------------------
 *
 * ASM_get_token() copies the next significant token of the line into token
 * and advances *lptr past it.  Tokens found in ASM_NOP_Sym are skipped.
 *
 * Returns FALSE, leaving token as it was after the last skipped symbol (if
 * any), when the end of the line or a comment character is the first thing
 * found after white space.  Returns TRUE otherwise.
 *
 * A character from ASM_delim_Sym is returned as a token of its own.  Note
 * that when *lptr rests on any other delimiter the token returned is the
 * empty string and *lptr is not advanced.
 *
 ---------------------------------------------------------------------------*/

BOOLEAN ASM_get_token(lptr, token)
    char **lptr;
    char *token;
{
    char *start;                /* where the current token begins */
    int length;                 /* number of characters in the token */
    int ignored;                /* receives an index nobody looks at */

    for (;;) {

        /* skip leading white space */
        while (IsWhite(**lptr))
            (*lptr)++;

        /* nothing more to offer at end of line or start of a comment */
        if (**lptr == '\0' || **lptr == COMMENT_CHAR)
            return FALSE;

        if (strchr(ASM_delim_Sym, **lptr) != NULL) {

            /* a delimiter which is a token in its own right */
            token[0] = **lptr;
            token[1] = '\0';
            (*lptr)++;
        }
        else {

            /* an ordinary token runs up to the next delimiter */
            start = *lptr;
            while (!IsDelim(**lptr))
                (*lptr)++;

            length = *lptr - start;
            strncpy(token, start, length);
            token[length] = '\0';
        }

        /* hand the token back unless it is one of the ignored symbols */
        if (!ASMIsMember(ASM_NOP_Sym, token, &ignored))
            return TRUE;
    }
}


/*----------------------------------------------------------------------------
 *
 * getline() obtains the next line of the buffer named inbuf.
 *
 * The buffer inbuf is made current, point is moved with nl_forward() and
 * the text between the old and the new point is copied into line with
 * grab().  The buffer which was current on entry is made current again
 * before returning, whether or not a line was obtained.
 *
 * Returns TRUE when a line was copied and FALSE when point did not move
 * (nothing left to read); line is not written in the FALSE case.
 *
 * NOTE(review): no size is passed for line, so the caller's array
 * presumably has to be large enough for the longest line in the buffer --
 * confirm against the documentation of grab().
 *
 ---------------------------------------------------------------------------*/

BOOLEAN getline(inbuf, line)
    char *inbuf;
    char *line;
{
    char *oldbuf = bufname;
    int start;
    BOOLEAN got_line = FALSE;

    /* switch to the input buffer first so that the starting position is
     * taken from that buffer and not from whichever one was current */
    bufname = inbuf;
    start = point;

    nl_forward();
    if (point != start) {
        grab(start, point, line);
        got_line = TRUE;
    }

    /* always hand the original buffer back, also at end of input */
    bufname = oldbuf;
    return got_line;
}


/*----------------------------------------------------------------------------
 *
 * output_tag() records one tag by handing the symbol, the input file name
 * and the character offset to add_tag().  The remaining parameters (outbuf,
 * line and line_number) are accepted but not used here.
 *
 ---------------------------------------------------------------------------*/

output_tag(outbuf, line, symbol, infname, line_number, char_number)
    char *outbuf;
    char *line;
    char *symbol;
    char *infname;
    int line_number;
    int char_number;
{
    /* A thin shell around add_tag() (defined in tags.e).  Code for other
     * output formats or for extra output information belongs here. */
    add_tag(symbol, infname, char_number);
}


/*----------------------------------------------------------------------------
 *
 * ASMtags() tags an input stream assuming input format of ASM 80x86 format
 * in MASM/TASM syntax
 *
 * Parameters:
 *    inbuf   - name of the buffer to read lines from (handed to getline())
 *    infname - file name recorded with each tag
 *    outbuf  - output buffer name, only passed along to output_tag()
 *
 * The routine is a finite state machine which runs until getline() reports
 * that no line is left.  Each line is inspected through at most its first
 * two significant tokens:
 *
 *    keyword name       (Parse1, Symbol1)          tags "name"
 *    name keyword       (Parse1, Parse2, Symbol2)  tags "name"
 *    name def-token     (Parse1, Parse2, Define)   tags "name"
 *
 * char_number holds the summed length of all lines read before the current
 * one; the offset reported for a tag is char_number plus the offset of the
 * tagged token within the current line.
 *
 * NOTE(review): char_number starts at 0 while reading starts wherever point
 * is in inbuf once exchange_point_and_mark() has run (which looks like the
 * former mark position), so the reported offsets look correct only if
 * reading begins at the start of the buffer -- confirm what the caller
 * guarantees.
 *
 * NOTE(review): line is a fixed 256 byte array and getline() is given no
 * size, so very long input lines are presumably a problem -- confirm.
 *
 ---------------------------------------------------------------------------*/

ASMTags(inbuf, infname, outbuf)
    char *inbuf;
    char *infname;
    char *outbuf;
{
    State state;                /* the current state of the parser */

    char line[256];             /* the current input line */
    char cur_token[256];        /* the current token */
    char prev_token[256];       /* the previous token */

    char *lptr;                 /* pointer into line for token parser */
    char *prev_lptr;            /* pointer into line for previous token */

    int line_number;            /* the current line in the file */
    int line_length;            /* the length of the current line */
    int char_number;            /* the current character in the file */

    int symbol_index;           /* the index into the token list of the
                                 * symbol */

    char *oldbuf = bufname;     /* buffer to make current again on exit */
    int *oldpoint;              /* spot used to restore point on exit */
    int *oldmark;               /* spot used to restore mark on exit */

    /* save current buffer state; the second spot is allocated after point
     * and mark have been exchanged, presumably so that it records the
     * position of the mark */
    oldpoint = alloc_spot();
    exchange_point_and_mark();
    oldmark = alloc_spot();

    /* init the engine */
    ASMParserInit();
    cur_token[0] = '\0';
    prev_token[0] = '\0';
    state = Discard;
    line_number = 0;
    line_length = 0;
    char_number = 0;
    lptr = prev_lptr = (char *) NULL;

    for (;;) {

        switch (state) {

            case Discard:       /* current line is not valid */

                /* if EOF then return */
                if (getline(inbuf, line)) {
                    lptr = line;

                    /* increment counters */
                    line_number++;

                    /* char_number increments by length of previous line */
                    char_number += line_length;

                    /* line length */
                    line_length = strlen(line);
                    state = Parse1;
                }
                else {
                    state = Exit;
                }
                break;

            case Parse1:        /* parsing for first *special* token */

                /* get the next valid token */
                if (!ASM_get_token(&lptr, cur_token)) {

                    /* if no token left or a comment as first non white space
                     * char in remainder of line */
                    state = Discard;
                }
                else {

                    /* move the cur_token to prev_token */
                    strcpy(prev_token, cur_token);

                    /* check for membership in the tagging symbol club */
                    if (ASMIsMember(ASM_sym, cur_token, &symbol_index)) {
                        state = Symbol1;
                    }
                    else {

                        /* check if comment block */
                        if (ASMIsMember(ASM_comment_block,
                                        cur_token, &symbol_index)) {

                            /* get the next non white character, this makes
                             * the assumption that the delimiter character is
                             * on the same line as the comment symbol. If the
                             * delimiter character is not on the current line
                             * then parsing continues normally on the next
                             * line. */
                            while (IsWhite(*lptr)) {
                                lptr++;
                            }

                            if (*lptr) {

                                /* this is the delimiter character, store it
                                 * and move lptr past it; the first byte of
                                 * cur_token is reused to hold it */
                                *cur_token = *lptr;
                                lptr++;

                                /* move over comment block, remembering to
                                 * update line info as we go */
                                while (*lptr != *cur_token) {

                                    /* get a new line if end of line */
                                    if (!*lptr) {
                                        if (!getline(inbuf, line)) {

                                            /* no more input: *lptr is the
                                             * terminator here, so copying it
                                             * into the stored delimiter
                                             * makes the loop test fail and
                                             * ends the skip */
                                            *cur_token = *lptr;
                                        }
                                        else {
                                            lptr = line;

                                            /* increment counters */
                                            line_number++;

                                            /* char_number increments by
                                             * length of previous line */
                                            char_number += line_length;

                                            /* line length */
                                            line_length = strlen(line);
                                        }
                                    }
                                    else {
                                        lptr++;
                                    }
                                }
                            }

                            /* whatever follows the closing delimiter on its
                             * line is dropped along with the comment */
                            state = Discard;
                        }
                        else {

                            /* nothing special, parse the next symbol */
                            state = Parse2;
                        }
                    }
                }
                break;

            case Parse2:        /* parsing for second *special* token */

                /* save the previous position; lptr rests just past the
                 * first token at this time */
                prev_lptr = lptr;

                /* get the next token */
                if (!ASM_get_token(&lptr, cur_token)) {

                    /* no token left, reset machine */
                    state = Discard;
                }
                else {

                    if (ASMIsMember(ASM_sym, cur_token, &symbol_index)) {

                        /* found a major symbol */
                        state = Symbol2;
                    }
                    else {

                        if (ASMIsMember(ASM_def, cur_token, &symbol_index)) {

                            /* found a defining token */
                            state = Define;
                        }
                        else {
                            state = Discard;
                        }
                    }
                }

                break;

            case Symbol1:       /* next token, ignore if no token found */

                /* get the next symbol and output it; lptr is just past the
                 * token, so stepping back by its length gives the offset of
                 * its first character */
                if (ASM_get_token(&lptr, cur_token)) {

                    if (ASMSymbolWanted(symbol_index)) {
                        output_tag(outbuf, line, cur_token, infname,
                                   line_number, char_number +
                                   lptr - line -
                                   strlen(cur_token));
                    }
                }

                /* reset machine */
                state = Discard;

                break;

            case Symbol2:       /* previous token was the wanted symbol */

                /* the previous token is the symbol of interest; prev_lptr
                 * is the position saved on entry to Parse2 */
                if (ASMSymbolWanted(symbol_index)) {
                    output_tag(outbuf, line, prev_token, infname,
                               line_number, char_number +
                               prev_lptr - line -
                               strlen(prev_token));
                }

                /* reset machine */
                state = Discard;

                break;

            case Define:        /* previous token was the wanted symbol */

                /* the previous token is the symbol of interest; index 1 of
                 * ASM_def is ":" so that case is governed by the label
                 * switch and every other index by the define switch */
                if ((ASMTagWantDefine && symbol_index != 1) ||
                    (ASMTagWantLabel && symbol_index == 1)) {
                    output_tag(outbuf, line, prev_token, infname,
                               line_number, char_number +
                               prev_lptr - line -
                               strlen(prev_token));
                }

                /* reset machine */
                state = Discard;

                break;

            case Exit:          /* clean it up */

                /* restore original location */
                bufname = oldbuf;
                point = *oldpoint;
                mark = *oldmark;
                free_spot(oldpoint);
                free_spot(oldmark);
                return;
                break;

            default:            /* not reached */
                break;
        }
    }
}

/*----------------------------------------------------------------------------
 *
 * tag_suffix_asm() and tag_suffix_inc() are procedure names recognized by
 * the tags package in Epsilon and are called automatically when files with
 * these extensions need tagging.  tag_suffix_asm() replaces the routine of
 * the same name defined in tags.e; tag_suffix_inc() is new.
 *
 ---------------------------------------------------------------------------*/


tag_suffix_asm()
{
    /* The output buffer name is not used by anyone at present and is only
     * passed along for a time when it may be needed; add_tag() decides on
     * its own which buffer receives the output.  Not by coincidence, the
     * name given here is the one add_tag() in tags.e uses. */
    char *tag_buffer = "-tags";

    ASMTags(bufname, filename, tag_buffer);
}

/* tag_suffix_inc() tags a .INC file; it simply defers to tag_suffix_asm()
 * so both extensions are handled the same way */
tag_suffix_inc()
{
    tag_suffix_asm();
}

/*
 * when_loading() rebuilds the default character class and case maps.
 *
 * Every entry of def_case_map[] and def_srch_case_map[] is first set to map
 * a character to itself.  Then, for each upper/lower case pair, the macro
 * UCLC() marks the character classes, makes def_case_map[] swap the two
 * characters and makes def_srch_case_map[] send the upper case character to
 * its lower case partner (lower case characters keep mapping to
 * themselves).
 *
 * The extended characters are written as numeric codes rather than as
 * character constants so that the file does not depend on the encoding it
 * is stored in.  The eight pairs are the upper/lower case letter pairs of
 * IBM code page 437, reconstructed to agree with the two C_LOWER ranges set
 * just before them -- NOTE(review): confirm against the original Epsilon
 * source if other code pages matter.
 */
when_loading()
{
#define UCLC(up, low)   def_char_class[low] = C_LOWER, \
                        def_char_class[up] = C_UPPER, \
                        def_srch_case_map[up] = low, \
                        def_case_map[low] = up, \
                        def_case_map[up] = low

    int i, j;

    /* start from identity maps */
    for (i = 0; i < 256; i++)
        def_case_map[i] = def_srch_case_map[i] = i;

    /* the plain letters */
    for (i = 'A', j = 'a'; i <= 'Z'; i++, j++)
        UCLC(i, j);

    /* the accented ranges are lower case unless a pair below says
     * otherwise */
    for (i = 131; i < 154; i++)
        def_char_class[i] = C_LOWER;
    for (i = 160; i < 164; i++)
        def_char_class[i] = C_LOWER;

    /* the extended upper/lower pairs, as (upper, lower) code page 437
     * codes */
    UCLC(128, 135);             /* C with cedilla */
    UCLC(154, 129);             /* U with diaeresis */
    UCLC(144, 130);             /* E with acute */
    UCLC(142, 132);             /* A with diaeresis */
    UCLC(143, 134);             /* A with ring */
    UCLC(146, 145);             /* AE ligature */
    UCLC(153, 148);             /* O with diaeresis */
    UCLC(165, 164);             /* N with tilde */
}
