tokenizer.h

/* This is the header file for the Tokenizer module.  It is also included by
   the Cases and Parser modules. */

/* string and character constants */

/* characters which, when they occur inside a string token, are stored with
   a Backslash_Character in front of them */
#define Quoted_LaTeX_Characters "$&%"
/* outside a string, starts a comment: the rest of the line is skipped */
#define Comment_Character '%'
/* begins and ends a string token; a doubled quote inside a string stands
   for a single quote character */
#define Quote_Character '"'
/* forms the TK_EQUALS token on its own */
#define Equals_Character '='
/* allowed inside an identifier (but not as its first character) */
#define Hyphen_Character '-'
/* the next three are treated as whitespace by the tokenizer */
#define Tab_Character '\t'
#define Vertical_Tab_Character '\v'
#define Form_Feed_Character '\f'
/* the prefix stored before each of the Quoted_LaTeX_Characters */
#define Backslash_Character '\\'

/* enumerated types */

/* The kinds of token which Get_Token can return.  The comment beside each
   kind names the member of token_details.details (if any) which the
   tokenizer fills in for it. */

typedef enum {
   TK_KEYWORD,                  /* a reserved word; details.keyword */
   TK_IDENTIFIER,               /* any other identifier; details.identifier */
   TK_STRING,                   /* a quoted string; details.string */
   TK_YEAR,                     /* a run of digits; details.year */
   TK_ATTRIBUTE_VECTOR,         /* a parenthesized list of attribute values;
                                   details.matrix_head */
   TK_EQUALS,                   /* the Equals_Character; no details */
   TK_EOF                       /* end of the input stream; no details */
} token_type;

/* The reserved words of the input language.  The tokenizer recognizes each
   one by an exact (case-sensitive) match against the upper-case word which
   follows the KW_ prefix: KW_AREA is the word AREA, and so on. */

typedef enum {
   KW_AREA,
   KW_ATTRIBUTE,
   KW_CASE,
   KW_CITATION,
   KW_CLOSING,
   KW_COURT,
   KW_EXTERNAL,
   KW_FACTS,
   KW_HELP,
   KW_HIERARCHY,
   KW_IDEAL,
   KW_NO,
   KW_OPENING,
   KW_QUESTION,
   KW_RESULT,
   KW_RESULTS,
   KW_SUMMARY,
   KW_UNKNOWN,
   KW_YEAR,
   KW_YES
} keyword_type;

/* structure type */

/* Everything the tokenizer reports about one token: where it was found,
   what kind of token it is and, for some kinds, its value.  Only the member
   of details which corresponds to token is meaningful. */

typedef struct {
   /* the tokenizer's line and column counters, as they stood just after the
      first character of the token was read */
   cardinal line_number,
      column_number;
   token_type token;
   union {
      keyword_type keyword;     /* valid when token is TK_KEYWORD */
      string identifier,        /* valid when token is TK_IDENTIFIER */
         string;                /* valid when token is TK_STRING */
      cardinal year;            /* valid when token is TK_YEAR */
      matrix_element *matrix_head;      /* valid when token is
                                           TK_ATTRIBUTE_VECTOR: the first
                                           element of a list linked through
                                           case_next */
   }  details;
} token_details;

/* external function */

/* Returns details of the next token read from in_stream; the token is
   TK_EOF once the end of in_stream has been reached.  Warnings and fatal
   errors are reported through log_stream. */

extern token_details
Get_Token(
      file in_stream,
      file log_stream);

tokenizer.c

/* This is the implementation file for the Tokenizer module. */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "shyster.h"
#include "cases.h"
#include "tokenizer.h"

static void
error_exit(
      file stream,
      string message,
      token_details *token)

/* Reports a tokenizer error.  Appends the position recorded in *token to
   message, in the form " [line,column]", and passes the result (labelled
   "Tokenizer") to Write_Error_Message_And_Exit which, as its name suggests,
   is not expected to return.

   The formatted text is limited to the size of the local buffer: a message
   which is too long is truncated instead of overrunning the buffer. */

{
   char full_message[Max_Error_Message_Length];

   snprintf(full_message, sizeof full_message, "%s [%u,%u]", message,
         token->line_number, token->column_number);
   Write_Error_Message_And_Exit(stream, "Tokenizer", full_message);
}

static void
warning(
      file stream,
      const string message,
      const token_details *token)

/* Reports a tokenizer warning.  Appends the position recorded in *token to
   message, in the form " [line,column]", and passes the result (labelled
   "Tokenizer") to Write_Warning_Message at Top_Level.

   The formatted text is limited to the size of the local buffer: a message
   which is too long is truncated instead of overrunning the buffer. */

{
   char full_message[Max_Error_Message_Length];

   snprintf(full_message, sizeof full_message, "%s [%u,%u]", message,
         token->line_number, token->column_number);
   Write_Warning_Message(stream, "Tokenizer", full_message, Top_Level);
}

static int
get_char(
      file in_stream,
      cardinal *line_number,
      cardinal *column_number,
      boolean *eof)

/* Reads one character from in_stream and returns it.  *eof records whether
   that character was EOF.  For any other character the position counters
   are brought up to date: a Carriage_Return_Character starts a new line
   (*line_number goes up by one and *column_number goes back to zero), and
   anything else advances *column_number by one.  The counters are left
   alone at the end of in_stream. */

{
   int next = getc(in_stream);

   *eof = (next == EOF);
   if (*eof)
      return next;

   if (next == Carriage_Return_Character) {
      (*line_number)++;
      *column_number = 0;
   } else {
      (*column_number)++;
   }
   return next;
}

static void
unget_char(
      file in_stream,
      file log_stream,
      int ch,
      token_details *token,
      cardinal *line_number,
      cardinal *column_number)

/* Pushes ch back onto in_stream and undoes the position adjustment which
   get_char made when ch was read: *line_number goes down by one for a
   Carriage_Return_Character, *column_number goes down by one for anything
   else.  Reports an error (at the position recorded in *token) if ungetc
   fails.

   Nothing at all is done if ch is EOF: get_char does not move the position
   counters when it meets the end of in_stream, so there is nothing to undo,
   and EOF cannot be pushed back.

   Note that the column reached before a Carriage_Return_Character is not
   remembered anywhere, so *column_number is not restored when one is pushed
   back. */

{
   char message[Max_Error_Message_Length];

   if (ch == EOF)
      return;

   if (ch == Carriage_Return_Character)
      (*line_number)--;
   else
      (*column_number)--;

   if (ungetc(ch, in_stream) == EOF) {
      sprintf(message, "ungetc failed with character `%c'", ch);
      error_exit(log_stream, message, token);
   }
}

static boolean
is_whitespace(
      int ch)

/* Tests whether ch is one of the five whitespace characters: the space, the
   tab, the vertical tab, the carriage return and the form feed.  The result
   is TRUE for those five, and FALSE for everything else. */

{
   const boolean blank = (ch == Space_Character) || (ch == Tab_Character);
   const boolean feed = (ch == Vertical_Tab_Character) ||
         (ch == Carriage_Return_Character) || (ch == Form_Feed_Character);

   return (blank || feed);
}

static boolean
is_alpha(
      int ch)

/* Tests whether ch is a letter, i.e. whether it lies in the range
   Big_A_Character ... Big_Z_Character or in the range Little_A_Character ...
   Little_Z_Character. */

{
   const boolean upper = (ch >= Big_A_Character) && (ch <= Big_Z_Character);
   const boolean lower = (ch >= Little_A_Character) && (ch <= Little_Z_Character);

   return (upper || lower);
}

static void
get_keyword_or_ident(
      file in_stream,
      file log_stream,
      int ch,
      token_details *token,
      cardinal *line_number,
      cardinal *column_number,
      boolean *eof)

/* Gets an identifier, which may be a keyword (the first character of the
   identifier - ch - has just been read).  Changes the structure pointed to
   by token: sets token->token to TK_KEYWORD or TK_IDENTIFIER, and sets
   token->details appropriately.

   EBNF:   identifier = letter { letter | digit | "-" }.                    */

{
   cardinal length = 1;
   string identifier;
   char message[Max_Error_Message_Length];

   /* allocate memory for the identifier */

   if ((identifier = (string) malloc((Max_Identifier_Length + 1) * sizeof(char))) == NULL)
      error_exit(log_stream, "malloc failed during keyword/identifier handling",
            token);

   /* put up to Max_Identifier_Length characters into the identifier */

   identifier[0] = ch;
   ch = get_char(in_stream, line_number, column_number, eof);
   while ((length < Max_Identifier_Length) &&
         (is_alpha(ch) || Is_Digit(ch) || (ch == Hyphen_Character))) {
      identifier[length++] = ch;
      ch = get_char(in_stream, line_number, column_number, eof);
   }
   identifier[length] = Null_Character;

   if (is_alpha(ch) || Is_Digit(ch) || (ch == Hyphen_Character)) {

      /* there is more of the identifier, so warn the user and skip over the
         rest of it */

      sprintf(message, "identifier truncated to \"%s\"", identifier);
      warning(log_stream, message, token);
      while (is_alpha(ch) || Is_Digit(ch) || (ch == Hyphen_Character))
         ch = get_char(in_stream, line_number, column_number, eof);

   } else

      /* reallocate (just enough) memory for the identifier */

      if ((identifier = (string) realloc((void *) identifier, length * sizeof(char))) ==
         NULL)
      error_exit(log_stream, "realloc failed during keyword/identifier handling",
            token);

   /* push the first character after the identifier back onto in_stream */

   unget_char(in_stream, log_stream, ch, token, line_number, column_number);

   /* check whether the identifier is a keyword */

   if (!strcmp(identifier, "AREA")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_AREA;
   } else if (!strcmp(identifier, "ATTRIBUTE")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_ATTRIBUTE;
   } else if (!strcmp(identifier, "CASE")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_CASE;
   } else if (!strcmp(identifier, "CITATION")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_CITATION;
   } else if (!strcmp(identifier, "CLOSING")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_CLOSING;
   } else if (!strcmp(identifier, "COURT")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_COURT;
   } else if (!strcmp(identifier, "EXTERNAL")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_EXTERNAL;
   } else if (!strcmp(identifier, "FACTS")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_FACTS;
   } else if (!strcmp(identifier, "HELP")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_HELP;
   } else if (!strcmp(identifier, "HIERARCHY")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_HIERARCHY;
   } else if (!strcmp(identifier, "IDEAL")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_IDEAL;
   } else if (!strcmp(identifier, "NO")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_NO;
   } else if (!strcmp(identifier, "OPENING")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_OPENING;
   } else if (!strcmp(identifier, "QUESTION")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_QUESTION;
   } else if (!strcmp(identifier, "RESULT")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_RESULT;
   } else if (!strcmp(identifier, "RESULTS")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_RESULTS;
   } else if (!strcmp(identifier, "SUMMARY")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_SUMMARY;
   } else if (!strcmp(identifier, "UNKNOWN")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_UNKNOWN;
   } else if (!strcmp(identifier, "YEAR")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_YEAR;
   } else if (!strcmp(identifier, "YES")) {
      token->token = TK_KEYWORD;
      token->details.keyword = KW_YES;
   } else {

      /* the identifier is not a keyword */

      token->token = TK_IDENTIFIER;
      token->details.identifier = identifier;
   }
}

static void
get_string(
      file in_stream,
      file log_stream,
      token_details *token,
      cardinal *line_number,
      cardinal *column_number,
      boolean *eof)

/* Gets a string (a " character has just been read).  Treats a pair of
   consecutive " characters as a single " character.  Treats consecutive
   whitespace characters as a single space character.  Sets token->details
   appropriately (token->token has already been set to TK_STRING).

   EBNF:   string = """" character { character } """".                      */

{
   int ch,
      next_ch;
   string temp_string;
   cardinal allocated_length,
      actual_length;

   allocated_length = String_Increment;
   actual_length = 0;

   /* allocate memory for the string */

   if ((temp_string = (string) malloc(allocated_length * sizeof(char))) == NULL)
      error_exit(log_stream, "malloc failed during string handling", token);

   /* get the first character of the string */

   ch = get_char(in_stream, line_number, column_number, eof);

   for (;;) {

      if (ch == EOF)
         error_exit(log_stream, "end of file in string", token);

      if (strchr(Quoted_LaTeX_Characters, ch) != NULL) {

         /* the character is one of those in Quoted_LaTeX_Characters (i.e. it
            is $, & or %); it has a special meaning in LaTeX and needs to be
            prefixed in the string by a \ character */

         temp_string[actual_length++] = Backslash_Character;

         if (actual_length == allocated_length)

            /* the string is too long for temp_string, so reallocate some
               more memory */

            if ((temp_string = (string) realloc((void *) temp_string,
                              (allocated_length += String_Increment) *
                              sizeof(char))) == NULL)
               error_exit(log_stream, "realloc failed during string handling",
                     token);
      }
      if (ch == Quote_Character)

         /* the character is a " character */

         if ((next_ch = get_char(in_stream, line_number, column_number, eof)) !=
               Quote_Character) {

            /* the next character is not a " character so this is the end of
               the string; push the first character after the string back
               onto in_stream */

            unget_char(in_stream, log_stream, next_ch, token, line_number, column_number);

            if (actual_length == 0)
               error_exit(log_stream, "empty string", token);
            else {
               temp_string[actual_length++] = Null_Character;
               if (actual_length < allocated_length)

                  /* reallocate (just enough) memory for the string */

                  if ((temp_string = (string) realloc((void *) temp_string,
                                    actual_length * sizeof(char))) == NULL)
                     error_exit(log_stream, "realloc failed during string handling",
                           token);
            }
            token->details.string = temp_string;

            return;
         }
      if (is_whitespace(ch)) {

         /* skip to the next non-whitespace character */

         for (ch = get_char(in_stream, line_number, column_number, eof);
               is_whitespace(ch);
               ch = get_char(in_stream, line_number, column_number, eof));

         if (ch == EOF)
            error_exit(log_stream, "end of file in string", token);

         /* put a single space character in the string for all of the
            whitespace */

         temp_string[actual_length++] = Space_Character;

         /* push the non-whitespace character back onto in_stream */

         unget_char(in_stream, log_stream, ch, token, line_number, column_number);

      } else
         temp_string[actual_length++] = ch;

      if (actual_length == allocated_length)

         /* the string is too long for temp_string, so reallocate some more
            memory */

         if ((temp_string = (string) realloc((void *) temp_string,
                           (allocated_length += String_Increment) *
                           sizeof(char))) == NULL)
            error_exit(log_stream, "realloc failed during string handling", token);

      /* get the next character */

      ch = get_char(in_stream, line_number, column_number, eof);
   }
}

static void
get_year(
      file in_stream,
      file log_stream,
      int ch,
      token_details *token,
      cardinal *line_number,
      cardinal *column_number,
      boolean *eof)

/* Reads the remainder of a year, whose first digit (ch) has already been
   consumed, and stores its value in token->details.year; token->token is
   assumed to have been set to TK_YEAR by the caller.  A year of more than
   Year_Digits digits is reported as an error.  The character which follows
   the year is pushed back onto in_stream.

   EBNF:   year = digit [ digit ] [ digit ] [ digit ].                      */

{
   cardinal value = (cardinal) ch - (cardinal) Zero_Character;
   cardinal count = 1;

   /* accumulate further digits, up to the limit */

   ch = get_char(in_stream, line_number, column_number, eof);
   while (Is_Digit(ch) && (count < Year_Digits)) {
      value = (10 * value) + (cardinal) ch - (cardinal) Zero_Character;
      count++;
      ch = get_char(in_stream, line_number, column_number, eof);
   }

   /* a digit at this point means the limit has been exceeded */

   if (Is_Digit(ch))
      error_exit(log_stream, "year has too many digits", token);

   unget_char(in_stream, log_stream, ch, token, line_number, column_number);
   token->details.year = value;
}

static void
get_attribute_vector(
      file in_stream,
      file log_stream,
      token_details *token,
      cardinal *line_number,
      cardinal *column_number,
      boolean *eof)

/* Gets an attribute vector (a left parenthesis character has just been
   read).  Sets token->details appropriately (token->token has already been
   set to TK_ATTRIBUTE_VECTOR).

   The vector is returned as a list of matrix elements, one per attribute
   value, linked through case_next in the order in which the values were
   read; token->details.matrix_head points to the first of them.  Each
   element is obtained from malloc, and its attribute_next is set to NULL.

   An empty vector, a character which is not an attribute value, or the end
   of in_stream before the right parenthesis, is reported as an error.

   EBNF:   attribute-vector = "(" attribute-value { attribute-value } ")".
           attribute-value  = "Y" | "N" | "U".                              */

{
   int ch;
   matrix_element *matrix_head,
     *matrix_pointer;
   boolean empty = TRUE;
   char message[Max_Error_Message_Length];

   /* allocate memory for this matrix element (the first in the list) */

   if ((matrix_head = (matrix_element *) malloc(sizeof(matrix_element))) == NULL)
      error_exit(log_stream, "malloc failed during attribute vector handling",
            token);

   matrix_pointer = matrix_head;

   /* for every character that is not a right parenthesis ... */

   for (ch = get_char(in_stream, line_number, column_number, eof);
         ch != Attribute_Vector_End_Character;
         ch = get_char(in_stream, line_number, column_number, eof)) {

      /* the end of the file is not a character: report it as such, instead
         of as an invalid attribute value */

      if (ch == EOF)
         error_exit(log_stream, "end of file in attribute vector", token);

      if (!empty) {

         /* allocate memory for this matrix element */

         if ((matrix_pointer->case_next =
                     (matrix_element *) malloc(sizeof(matrix_element))) == NULL)
            error_exit(log_stream, "malloc failed during attribute vector handling",
                  token);
         matrix_pointer = matrix_pointer->case_next;
      }
      switch (ch) {
         case Yes_Character:
            matrix_pointer->attribute_value = YES;
            break;
         case No_Character:
            matrix_pointer->attribute_value = NO;
            break;
         case Unknown_Character:
            matrix_pointer->attribute_value = UNKNOWN;
            break;
         default:
            sprintf(message, "invalid attribute value `%c'", ch);
            error_exit(log_stream, message, token);
            break;
      }
      empty = FALSE;

      /* this element is (for now) the last in the list */

      matrix_pointer->case_next = NULL;
      matrix_pointer->attribute_next = NULL;
   }
   if (empty)
      error_exit(log_stream, "empty attribute vector", token);
   token->details.matrix_head = matrix_head;
}

static void
skip_to_end_of_line(
      file in_stream,
      cardinal *line_number,
      cardinal *column_number,
      boolean *eof)

/* Discards characters from in_stream, up to and including the next
   Carriage_Return_Character; stops early if the end of in_stream is reached
   first. */

{
   int ch;

   do
      ch = get_char(in_stream, line_number, column_number, eof);
   while ((ch != EOF) && (ch != Carriage_Return_Character));
}

extern token_details
Get_Token(
      file in_stream,
      file log_stream)

/* Returns details of the next token from in_stream.  Whitespace and
   comments (from a Comment_Character to the end of the line) are skipped.
   The token returned is TK_EOF once the end of in_stream has been reached,
   and on every call after that.  A character which cannot start a token is
   reported as an error.

   The line number, column number and end-of-file flag are kept in static
   variables, so they carry over from one call to the next no matter which
   in_stream is supplied.

   Every token returned, TK_EOF included, carries a line number and a column
   number. */

{
   token_details token;
   int ch;
   static cardinal line_number = 1,
      column_number = 0;
   static boolean eof = FALSE;
   char message[Max_Error_Message_Length];

   for (;;) {

      if (eof) {

         /* the end of in_stream was reached earlier (by a previous call, or
            while skipping a comment): report where it was found */

         token.line_number = line_number;
         token.column_number = column_number;
         token.token = TK_EOF;
         return token;
      }

      /* skip to the next non-whitespace character */

      for (ch = get_char(in_stream, &line_number, &column_number, &eof);
            is_whitespace(ch);
            ch = get_char(in_stream, &line_number, &column_number, &eof));

      /* the token is located at the position of its first character */

      token.line_number = line_number;
      token.column_number = column_number;

      if (is_alpha(ch)) {
         get_keyword_or_ident(in_stream, log_stream, ch, &token,
               &line_number, &column_number, &eof);
         return token;
      } else if (ch == Quote_Character) {
         token.token = TK_STRING;
         get_string(in_stream, log_stream, &token, &line_number, &column_number, &eof);
         return token;
      } else if (Is_Digit(ch)) {
         token.token = TK_YEAR;
         get_year(in_stream, log_stream, ch, &token, &line_number, &column_number, &eof);
         return token;
      } else if (ch == Attribute_Vector_Begin_Character) {
         token.token = TK_ATTRIBUTE_VECTOR;
         get_attribute_vector(in_stream, log_stream, &token,
               &line_number, &column_number, &eof);
         return token;
      } else if (ch == Equals_Character) {
         token.token = TK_EQUALS;
         return token;
      } else if (ch == EOF) {
         token.token = TK_EOF;
         return token;
      } else if (ch == Comment_Character)

         /* a comment is not a token: skip it and go round again */

         skip_to_end_of_line(in_stream, &line_number, &column_number, &eof);
      else {
         sprintf(message, "invalid character `%c'", ch);
         error_exit(log_stream, message, &token);
      }
   }
}

Other SHYSTER modules: Shyster, Statutes, Cases, Parser, Dumper, Checker, Scales, Adjuster, Consultant, Odometer and Reporter.
Copyright notice.  Valid HTML 4.0.
Home page:  <http://www.popple.net/james/>
E-mail:  <james@popple.net>
Last modified:  30 April 1995