⟦70421f324⟧

TextFile


#ifndef INCLUDED_STREAM
#include <stream.h>
#endif
#ifndef INCLUDED_STRING
#include <string.h>
#endif
#ifndef INCLUDED_STDLIB
#include <stdlib.h>
#endif
#ifndef INCLUDED_ASSERT
#include <assert.h>
#endif
#ifndef INCLUDED_CTYPE
#include <ctype.h>
#endif

#include "clex.h"

// get string value tables, sym_str[] and keywords[] :
#define CLEX_IMPLEMENTATION 1
#include "clex_sym.h"

/******************************************************************************
*                                                                             *
*  KWTABLE -- keyword hash table (internal use only)                          *
*     KWtable implements a collision-free hash table of C++ keywords.  The    *
*     table size and hash function are computed by use of a standalone C      *
*     program, kwhash.c, included in this directory.                          *
*                                                                             *
******************************************************************************/

#define U_short unsigned short
#define U_char  unsigned char

struct KWtable
    {
    // number of slots, as computed by kwhash.c for a=9,b=2,c=2
    enum { HASHSIZE = 131 };

    // construction and access
    KWtable(char** kwl);
    U_short  hash(const U_char* cp, U_short len);
    void     insert(char* cp, Clex_sym s);
    Clex_sym lookup(char* cp, short len);

    // one slot per hash value; kwp is NULL while the slot is unused
    struct  {
            char* kwp;          // keyword spelling
            Clex_sym sym;       // symbol returned for that keyword
            } kwhash[HASHSIZE];
    };

// The single keyword table, built from keywords[] (defined in clex_sym.h).
// Direct-initialized so that the constructor runs on kwt itself instead of
// on a temporary that is afterwards copied into it.
static KWtable kwt(keywords);

// Build the table from the keyword list kwl: every slot is first marked
// empty, then the CLEX_NUMKEYS entries of kwl are inserted, entry i being
// given the symbol value KEYWORD_S + i.
KWtable::
KWtable (char** kwl)
    {
    short int i;
    for (i = 0; i < HASHSIZE; ++i)
        kwhash[i].kwp = NULL;
    for (i = 0; i < CLEX_NUMKEYS; ++i)
        insert(kwl[i], Clex_sym(KEYWORD_S + i));    // int -> Clex_sym made explicit
    // rely on assert() to prevent hash collisions -- may need
    //  a new hash function or table size when keyword added.
    }

// Slot index for the len-character word at cp.  The shift amounts used
// here, together with HASHSIZE, were found with the standalone C program
// kwhash.c so that no two keywords collide.  Only cp[0], cp[1] and
// cp[len-1] are read, so len must be at least 2.

inline
U_short KWtable::
hash (const U_char* cp, U_short len)
    {
    unsigned h = (U_short)cp[0];
    h ^= (U_short)cp[1]     << 9;
    h ^= (U_short)cp[len-1] << 2;
    h ^= len                << 2;
    return h % HASHSIZE;
    }

void KWtable::
insert (char* cp, Clex_sym s)
    {
    U_short h = hash(cp, strlen(cp));
    assert(kwt.kwhash[h].kwp == NULL);  // collisions not permitted.
    kwt.kwhash[h].kwp = cp;
    kwt.kwhash[h].sym = s;
    }

// Classify the NUL-terminated word cp of length len.  Returns the symbol
// stored for it if it is in this table, IDENT_S otherwise.
Clex_sym KWtable::
lookup (char* cp, short len)
    {
    // words outside this length range are never hashed; it also
    // guarantees that hash() may read cp[1].
    if (len < 2 || len > 9) return (IDENT_S);
    U_short h = hash((const unsigned char*)cp, len);
    if (kwhash[h].kwp == NULL) return (IDENT_S);
    // a non-keyword can land on an occupied slot, so compare spellings.
    if (strcmp(kwhash[h].kwp, cp)) return (IDENT_S);
    return (kwhash[h].sym);
    }

/******************************************************************************
*                                                                             *
*  CLEX -- c++ lexical scanner                                               *
*                                                                             *
******************************************************************************/

// CONSTRUCTOR Clex:
//   f is the stream to be scanned.  b is kept as block_brack: when it is
//   TRUE, next() passes an opening '[' to lbrack(), which gathers the
//   bracket contents into the string buffer; otherwise '[' and ']' come
//   back as the plain tokens LBRACK_S and RBRACK_S.

Clex::
Clex (FILE* f, Boolean b)
    {
    // input source and bracket option
    block_brack = b;
    fp = f;

    // nothing scanned yet: empty string buffer, no file name
    buf[0] = '\0';
    bufsiz = 0;
    filename[0] = '\0';

    // prime the pipeline.  The lookahead starts out as a newline so
    // that a '#' as the very first character of the input is handled.
    look = '\n';
    line_num = 0;
    }

// num() scans a numeric literal into the string buffer.  c is the first
// character of the literal, already consumed by next(): a digit, or a '.'
// that is followed by a digit.  Returns NUM_S for integer literals
// (decimal, hexadecimal, with optional L/U suffixes) and FLOATNUM_S for
// anything with a fraction or an exponent.
Clex_sym Clex::
num (char c)
    {
    // a literal that starts with '.' is already in its fraction part.
    Clex_sym s = (c == '.') ? FLOATNUM_S : NUM_S;

    bufsiz = 0;
    put_in_buf(c);
    while (isdigit(look))
        buf_one();

    // hexadecimal
    if (bufsiz == 1 && *buf == '0' && (look == 'x' || look == 'X'))
        {
        do { buf_one(); }
            while (isxdigit(look));
        while (look == 'L' || look == 'l' || look == 'U' || look == 'u')
            buf_one();
        return terminate(s);
        }

    if (s == NUM_S)
        {
        // long and/or unsigned, in any combination ("UL", "lu", ...)
        if (look == 'L' || look == 'l' || look == 'U' || look == 'u')
            {
            do { buf_one(); }
                while (look == 'L' || look == 'l' || look == 'U' || look == 'u');
            return terminate(NUM_S);
            }

        // floating point
        if (look == '.')
            {
            s = FLOATNUM_S;
            do { buf_one(); }
                while (isdigit(look));
            }
        }

    // scientific notation: 'e', then an optional sign, then the digits.
    // The sign is only accepted directly after the 'e', so that "1e5+3"
    // ends the literal at "1e5".
    if (look == 'e' || look == 'E')
        {
        s = FLOATNUM_S;
        buf_one();
        if (look == '+' || look == '-')
            buf_one();
        while (isdigit(look))
            buf_one();
        }

    // float / long double suffix
    if (s == FLOATNUM_S &&
        (look == 'f' || look == 'F' || look == 'l' || look == 'L'))
        buf_one();

    return terminate(s);
    }

// ident() collects an identifier into the string buffer.  first is its
// initial character, already consumed by next().  Letters, digits, '_'
// and '$' continue the identifier.  The result is IDENT_S unless the word
// could be a keyword, in which case the keyword table decides.
Clex_sym Clex::
ident (char first)
    {
    Boolean kw_candidate = TRUE;
    short n = 0;

    buf[n++] = first;
    for (;;)
        {
        // note: this loop accounts for 30% of the total scan time
        if (!(isalnum(look) || look == '_' || look == '$'))
            break;
        // an upper-case letter or '_' rules out a keyword
        if (kw_candidate && (isupper(look) || look == '_' ))
            kw_candidate = FALSE;
        buf[n++] = look;        // don't worry about overflow
        eat_one();
        }
    buf[n] = '\0';
    bufsiz = n;

    if (!kw_candidate)
        return IDENT_S;
    return kwt.lookup(buf, bufsiz);
    }

// quote() gathers the text of a quoted string or character constant into
// the string buffer, up to the closing delimiter c; the opening delimiter
// has already been consumed by the caller.  On success the closing
// delimiter is eaten (not buffered) and s is returned.  An unescaped
// newline yields ERROR_EOLN_S and end of file yields ERROR_EOF_S.  The
// buffer is restarted only when m is CL_NONE, so that a caller already
// collecting text can have the string appended to it.
Clex_sym Clex::
quote (char c, Clex_sym s, Clex_mode m)
    {
    if (m == CL_NONE)
        bufsiz = 0;

    for (;;)
        {
        if (look == c)
            break;
        if (look == EOF)
            return terminate(ERROR_EOF_S);
        if (look == '\n')
            return terminate(ERROR_EOLN_S);
        if (look != '\\')
            { buf_one(); continue; }    // ordinary character

        eat_one();                      // the backslash
        if (look == '\n')               // continuation line: drop both
            { eat_one(); eoln(m|CL_QUOTE); continue; }
        if (look == EOF)
            return terminate(ERROR_EOF_S);
        put_in_buf('\\');               // this handles \' and \" too.
        buf_one();                      // the escaped character
        }

    eat_one();  // eat the closing quote
    return terminate(s);
    }


// lbrack() accumulates the contents between "[" and "]" into
//  the string buffer, handling syntactically quoted strings,
//  comments, and nested brackets.  Note that lbrack() is
//  called recursively in the case of nested brackets.
//  The opening '[' has already been consumed by the caller.  Newlines
//  and comments are dropped from the collected text.  Returns LBRACK_S
//  once the matching ']' has been eaten, or ERROR_EOF_S if the input
//  ends first.  The buffer is restarted only when m is CL_NONE.

Clex_sym Clex::
lbrack (Clex_mode m)
    {
    if (m == CL_NONE)
        bufsiz = 0;
    while (look != ']')
        {
        if (look == EOF)
            return terminate(ERROR_EOF_S);

        else if (look == '\n')
            { eat_one(); eoln(m|CL_BRACK); }
        else if (look == '[')
            {
            buf_one();
            if (lbrack(m|CL_BRACK) == ERROR_EOF_S)
                return ERROR_EOF_S;     // already cleaned up.
            else put_in_buf(']');       // the nested call ate it unbuffered
            }
        else if (look == '\'' || look == '"')
            {
            char c = look;
            buf_one();
            (void) quote(c, NONE_S, m|CL_BRACK);
            put_in_buf(c);
            }
        else if (look == '/')           // maybe a comment
            {
            eat_one();
            if (look == '/')
                line_comment();
            else if (look == '*')
                {
                block_comment(m|CL_BRACK);
                if (look == EOF) return terminate(ERROR_EOF_S);
                }
            else
                // Not a comment: stash only the '/'.  The character
                // after it is left as lookahead so the loop classifies
                // it normally -- it may be ']', '[', a quote, a
                // newline or end of file.
                put_in_buf('/');
            }
        else                            // just a character to save
            buf_one();
        }

    eat_one(); // eat the ']'.
    return terminate(LBRACK_S);
    }


// block_comment() skips a "/* ... */" comment.  On entry the lookahead
// is the '*' that opens the comment.  Returns with the closing "*/"
// eaten, or with look == EOF if the input ends inside the comment.
// Each newline inside the comment is passed to eoln() with CL_COMMENT set.
void Clex::
block_comment(Clex_mode m)
    {
    eat_one(); // eat the '*'
    for (;;)
        {
        if (look == '*')
            {
            eat_one();
            if (look == '/')
                break;                  // found the terminator
            }
        if (look == EOF)
            return;
        if (look == '\n')
            { eat_one(); eoln(m|CL_COMMENT); }
        else if (look != '*')           // a '*' is re-examined above
            eat_one();
        }
    eat_one(); // eat the '/'
    }

// line_comment() skips the rest of a "//" comment.  The current
// lookahead is always eaten; scanning then stops with the lookahead on
// the terminating newline (which is not eaten) or on EOF.
void Clex::
line_comment()
    {
    eat_one();
    while (look != '\n' && look != EOF)
        eat_one();
    }

// eat_return() eats the lookahead character and hands back s.  It exists
//  to save space in Clex::next(): the inline function eat_one() expands
//  to quite a lot of code, so the two-character tokens share this call.
Clex_sym Clex::
eat_return(Clex_sym s)
    {
    eat_one();
    return s;
    }

// next() returns the symbol of the next token in the input.  Blanks,
// control characters and comments are skipped; each newline is passed to
// eoln().  Identifiers, numbers, quoted text and (when block_brack is
// set) bracket contents are left in the string buffer by the helper that
// scans them.  Returns EOF_S at end of input, ERROR_EOF_S when the input
// ends inside a block comment, and ERROR_UNKN_S for a character or
// sequence (such as "..") that starts no known token.
Clex_sym Clex::
next()
    {
    short val;
    // take the lookahead as the current character and advance.
    while (val = look, eat_one(), val != EOF)
        {
        char ch = char(val);
        switch (ch)
            {
        case ' ' : continue;

        case '_' :
        case '$' : return ident(ch);

        case '0' : case '1' : case '2' : case '3' : case '4' :
        case '5' : case '6' : case '7' : case '8' : case '9' :
                   return num(ch);

        case ',' : return COMMA_S;
        case ';' : return SEMI_S;
        case '[' : if (block_brack) return lbrack(CL_NONE);
                   else             return LBRACK_S;
        case ']' : return RBRACK_S;
        case '{' : return LBRACE_S;
        case '}' : return RBRACE_S;
        case '(' : return LPAR_S;
        case ')' : return RPAR_S;
        case '~' : return TILDE_S;
        case '?' : return QUEST_S;
        case '"' : return quote(ch, QUOTE_S, CL_NONE);
        case '\'': return quote(ch, APOS_S, CL_NONE);

        case '=' :                              // '=', '=='
            if (look != '=') return AS_S;
            else  return eat_return(EQ_S);

        case ':' :                              // ":", "::"
            if (look != ':') return COLON_S;
            else  return eat_return(SCOPE_S);

        case '!' :                              // "!", "!="
            if (look != '=') return BANG_S;
            else  return eat_return(NE_S);

        case '^' :                              // "^", "^="
            if (look != '=') return CARET_S;
            else  return eat_return(XORAS_S);

        case '*' :                              // '*', '*='
            if (look != '=') return STAR_S;
            else  return eat_return(MULAS_S);

        case '%' :                              // '%', '%='
            if (look != '=') return MOD_S;
            else  return eat_return(MODAS_S);

        case '|' :                              //  "|=", "||", "|"
            if      (look == '|') return eat_return(LOR_S);
            else if (look == '=') return eat_return(ORAS_S);
            else                             return VBAR_S;

        case '&' :                              // "&", "&=", "&&"
            if      (look == '&') return eat_return(LAND_S);
            else if (look == '=') return eat_return(ANDAS_S);
            else                             return AMPER_S;

        case '+' :                              // '+', '++', '+='
            if      (look == '+') return eat_return(INCRE_S);
            else if (look == '=') return eat_return(ADDAS_S);
            else                             return PLUS_S;

        case '-' :                              // '--', '-=', '->', '-',
            if      (look == '-') return eat_return(DECRE_S);
            else if (look == '=') return eat_return(SUBAS_S);
            else if (look == '>') return eat_return(DEREF_S);
            else                             return MINUS_S;

        case '/' :                              // '/*', '//', '/=', '/'
            if (look == '*')
                {
                block_comment(CL_NONE);
                if (look == EOF)       // almost certainly a mistake:
                    return ERROR_EOF_S;
                else continue;
                }
            else if (look == '/')
                { line_comment(); continue; }
            else if (look == '=') return eat_return(DIVAS_S);
            else                             return SLASH_S;

        case '.' :                              // ".", "..."
            if (isdigit(look))      return num(ch);
            else if (look == '.')
                {
                eat_one();          // check for "..", undefined.
                if (look != '.')    return ERROR_UNKN_S;
                else    return  eat_return(ELLIP_S);
                }
            else                    return DOT_S;

        case '<' :                              // '<=', '<', '<<', '<<='
            if      (look == '=')   return eat_return(LE_S);
            else if (look == '<')
                {
                eat_one();
                if  (look != '=')   return SHL_S;
                else     return eat_return(SHLAS_S);
                }
            else                    return LT_S;

        case '>' :                              // '>=', '>', '>>', '>>='
            if      (look == '=')   return eat_return(GE_S);
            else if (look == '>')
                {
                eat_one();
                if  (look != '=')   return SHR_S;
                else     return eat_return(SHRAS_S);
                }
            else                    return GT_S;

        default:
            // ch may be negative for characters above 127; the <ctype.h>
            // functions are only defined for unsigned char values and EOF.
            if (isalpha((unsigned char)ch))
                return ident(ch);
            if (ch == '\n')
                eoln(CL_NONE);          // then go round for the next token
            else if (iscntrl((unsigned char)ch))
                continue;               // tabs etc. are white space
            else
                return ERROR_UNKN_S;
            }
        }

    return EOF_S;
    }

// Quickbuf -- fixed-size character buffer used to collect one '#' line.
//   put_in() appends a character and silently drops it once the buffer is
//   full; one byte is always kept free so terminate() can store the NUL.
struct Quickbuf
    {
    enum { CAPACITY = 10240 };

    short len;                  // number of characters stored so far
    char line[CAPACITY];        // the characters; NUL-terminated by terminate()

    Quickbuf() : len(0) { }

    void put_in(char c)
        {
        if (len >= CAPACITY - 1)
            return;             // full: drop the character
        line[len++] = c;
        }

    void terminate()    { line[len] = '\0'; }
    };

// eoln() is called after each newline has been eaten.  It counts the
// line and, unless the newline was inside a quote, a comment or a '#'
// continuation, checks whether the new line is a '#' line.  If so, the
// text after the '#' is collected -- continuation lines joined, comments
// removed, control characters turned into blanks -- and handed to
// pound().  The newline that ends the '#' line is left as the lookahead.
void Clex::
eoln(Clex_mode m)
    {
    // assume NL character already eaten.
    ++line_num;
    // don't process '#' lines in quotes, comments, or '#' continuations.
    if (m & (CL_QUOTE|CL_POUND|CL_COMMENT))
        return;

    // eat whitespace
    while (look != EOF && look != '\n')
        {
        if (look == ' ' || iscntrl((unsigned char)look)) eat_one();
        else break;
        }
    if (look != '#')
        return;

    // eat the '#' and subsequent whitespace
    do { eat_one(); if (look == EOF || look == '\n') break; }
       while (look == ' ' || iscntrl((unsigned char)look));

    // collect the '#' line.  The test comes first: a '#' followed
    // directly by newline or EOF has nothing to collect, and its newline
    // must stay in the lookahead to be counted like any other.
    Quickbuf b;
    while (look != '\n' && look != EOF)
        {   // record line
        if (look == '\\')       // check for continuation line
            {
            eat_one();
            if (look == '\n') { eat_one(); eoln(m|CL_POUND); }
            else { b.put_in('\\'); }
            }
        else if (look == '/')   // check for comment in '#' line
            {
            eat_one();
            if (look == '*')
                {
                block_comment(m|CL_POUND);
                if (look == EOF) break;
                }
            else if (look == '/') line_comment();
            else { b.put_in('/'); }
            }
        else
            {
            if (iscntrl((unsigned char)look)) look = ' ';
            b.put_in(look);
            eat_one();
            }
        }
    b.terminate();

    (void) pound(m, b.line, b.len);     // call virtual handler
    }

// pound() handles one '#' line.  line is the NUL-terminated text after
// the '#' (len characters), as collected by eoln().  Two forms are
// understood:
//      # <line> "<filename>"   and   #line <line> "<filename>"
// For these, line_num is set and, if a quoted name follows, it is copied
// into filename; TRUE is returned.  Any other '#' line is left alone and
// FALSE is returned.  m is not used here.
Boolean Clex::
pound (Clex_mode m, char* line, short len)
    {
    (void) m;               // unused; "void(m);" would declare a variable
    char* cp = line;
    if (!isdigit((unsigned char)*cp))
        {
        if (len < 5) return FALSE;
        if (strncmp(cp, "line ", 5) != 0)
            return FALSE;   // don't know what it is
        cp += 4;
        while (*cp == ' ') ++cp;
        if (!isdigit((unsigned char)*cp))
            return FALSE;
        }

    // # <line> "<filename>"   or    #line <line> "<filename>"
    line_num = atoi(cp) - 1;    // will be incremented by eoln() later
    while (isdigit((unsigned char)*cp)) ++cp;
    while (*cp == ' ')   ++cp;
    if (*cp == '"')
        {
        char* cpq = cp;
        do { ++cpq; }
           while (*cpq != '"' && *cpq != '\0');
        // Never copy more than filename can hold.
        // NOTE(review): assumes filename is a char array member (the
        // constructor writes filename[0] without allocating) -- confirm
        // against clex.h.
        size_t n = cpq - cp - 1;
        if (n > sizeof(filename) - 1)
            n = sizeof(filename) - 1;
        strncpy(filename, cp+1, n);
        filename[n] = '\0';
        }

    return TRUE;
    }

// debug() returns the printable name of symbol s: keyword symbols
// (those at or above KEYWORD_S) are looked up in keywords[], all
// others in sym_str[].
const char* Clex::
debug (Clex_sym s)
    {
    if (s >= KEYWORD_S)
        return keywords[s - KEYWORD_S];
    return sym_str[s];
    }
DataMuseum.dk

DKUUG/EUUG Conference tapes

⟦70421f324⟧ TextFile

Derivation

TextFile