/* This is the header file for the Tokenizer module. It is also included by
the Cases and Parser modules. */
/* string and character constants */
/* Characters that get_string prefixes with a backslash when they occur
inside a string, because they have a special meaning in LaTeX. */
#define Quoted_LaTeX_Characters "$&%"
/* Outside a string, starts a comment that runs to the end of the line. */
#define Comment_Character '%'
/* Opens and closes a string; a doubled quote inside a string stands for a
single quote character. */
#define Quote_Character '"'
/* Produces a TK_EQUALS token. */
#define Equals_Character '='
/* Allowed inside an identifier (but not as its first character). */
#define Hyphen_Character '-'
/* The next three are treated as whitespace by the tokenizer. */
#define Tab_Character '\t'
#define Vertical_Tab_Character '\v'
#define Form_Feed_Character '\f'
/* Inserted in front of each of the Quoted_LaTeX_Characters in a string. */
#define Backslash_Character '\\'
/* enumerated types */
typedef enum {
TK_KEYWORD,
TK_IDENTIFIER,
TK_STRING,
TK_YEAR,
TK_ATTRIBUTE_VECTOR,
TK_EQUALS,
TK_EOF
} token_type;
/* The reserved words recognised by the tokenizer. Each constant corresponds
to the upper-case word spelled like its suffix (KW_AREA for "AREA", and so
on); the tokenizer matches keywords by exact, case-sensitive comparison. */
typedef enum {
KW_AREA,
KW_ATTRIBUTE,
KW_CASE,
KW_CITATION,
KW_CLOSING,
KW_COURT,
KW_EXTERNAL,
KW_FACTS,
KW_HELP,
KW_HIERARCHY,
KW_IDEAL,
KW_NO,
KW_OPENING,
KW_QUESTION,
KW_RESULT,
KW_RESULTS,
KW_SUMMARY,
KW_UNKNOWN,
KW_YEAR,
KW_YES
} keyword_type;
/* structure type */
/* Everything the tokenizer reports about one token: where it was found, what
kind it is, and a kind-specific payload. */
typedef struct {
/* position counters as they stood just after the first character of the
token was read */
cardinal line_number,
column_number;
/* selects which member of details is valid */
token_type token;
union {
/* valid for TK_KEYWORD */
keyword_type keyword;
/* identifier is valid for TK_IDENTIFIER, string for TK_STRING; both point
at memory obtained from malloc by the tokenizer */
string identifier,
string;
/* valid for TK_YEAR */
cardinal year;
/* valid for TK_ATTRIBUTE_VECTOR: first element of a list linked through
case_next, one element per attribute value */
matrix_element *matrix_head;
} details;
} token_details;
/* external function */
/* Reads and returns the next token from in_stream. Warnings and errors are
reported via log_stream. Once the end of in_stream has been reached, every
further call returns a TK_EOF token. */
extern token_details
Get_Token(
file in_stream,
file log_stream);
/* This is the implementation file for the Tokenizer module. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "shyster.h"
#include "cases.h"
#include "tokenizer.h"
static void
error_exit(
        file stream,
        string message,
        token_details *token)
/* Reports a tokenizer error. Appends the position of token, in the form
   " [line,column]", to message and hands the result to
   Write_Error_Message_And_Exit (defined elsewhere; judging by its name it
   does not return, and the callers in this file rely on that).

   The combined text is truncated, rather than overflowing the buffer, if it
   does not fit in Max_Error_Message_Length characters: callers build message
   in buffers of that same size, so the position suffix may not fit. */
{
    char full_message[Max_Error_Message_Length];

    snprintf(full_message, sizeof full_message, "%s [%u,%u]", message,
            token->line_number, token->column_number);
    Write_Error_Message_And_Exit(stream, "Tokenizer", full_message);
}
static void
warning(
        file stream,
        const string message,
        const token_details *token)
/* Reports a tokenizer warning. Appends the position of token, in the form
   " [line,column]", to message and passes the result to
   Write_Warning_Message at Top_Level.

   The combined text is truncated, rather than overflowing the buffer, if it
   does not fit in Max_Error_Message_Length characters. */
{
    char full_message[Max_Error_Message_Length];

    snprintf(full_message, sizeof full_message, "%s [%u,%u]", message,
            token->line_number, token->column_number);
    Write_Warning_Message(stream, "Tokenizer", full_message, Top_Level);
}
static int
get_char(
        file in_stream,
        cardinal *line_number,
        cardinal *column_number,
        boolean *eof)
/* Reads one character from in_stream and returns it (EOF at end of input).
   *eof is set to reflect whether the end of input was hit. For any real
   character the position counters are advanced: a carriage return starts a
   new line (line + 1, column 0), anything else moves one column right. The
   counters are left alone when EOF is returned. */
{
    int next = getc(in_stream);

    *eof = (next == EOF);
    if (*eof)
        return next;

    if (next == Carriage_Return_Character) {
        (*line_number)++;
        *column_number = 0;
    } else {
        (*column_number)++;
    }
    return next;
}
static void
unget_char(
        file in_stream,
        file log_stream,
        int ch,
        token_details *token,
        cardinal *line_number,
        cardinal *column_number)
/* Pushes ch back onto in_stream and undoes the position adjustment that
   get_char made when it read ch.

   EOF is never pushed back and, because get_char does not advance the
   counters when it returns EOF, the counters are not touched for it either
   (decrementing here could wrap an unsigned column of 0).

   Note that un-reading a carriage return restores the line number only; the
   previous column number was discarded by get_char and is not recovered.

   Reports an error through error_exit if ungetc fails. */
{
    char message[Max_Error_Message_Length];

    if (ch == EOF)
        return;

    if (ch == Carriage_Return_Character)
        (*line_number)--;
    else
        (*column_number)--;

    if (ungetc(ch, in_stream) == EOF) {
        snprintf(message, sizeof message,
                "ungetc failed with character `%c'", ch);
        error_exit(log_stream, message, token);
    }
}
static boolean
is_whitespace(
        int ch)
/* Tells whether ch is one of the characters the tokenizer treats as
   whitespace: space, tab, vertical tab, carriage return or form feed. */
{
    int blank = (ch == Space_Character);

    blank = blank || (ch == Tab_Character);
    blank = blank || (ch == Vertical_Tab_Character);
    blank = blank || (ch == Carriage_Return_Character);
    blank = blank || (ch == Form_Feed_Character);
    return blank;
}
static boolean
is_alpha(
        int ch)
/* Tells whether ch is a letter, i.e. lies in the range A ... Z or in the
   range a ... z. */
{
    int upper_case = (ch >= Big_A_Character) && (ch <= Big_Z_Character);
    int lower_case = (ch >= Little_A_Character) && (ch <= Little_Z_Character);

    return upper_case || lower_case;
}
static void
get_keyword_or_ident(
        file in_stream,
        file log_stream,
        int ch,
        token_details *token,
        cardinal *line_number,
        cardinal *column_number,
        boolean *eof)
/* Gets an identifier, which may be a keyword (the first character of the
   identifier - ch - has just been read). Changes the structure pointed to
   by token: sets token->token to TK_KEYWORD or TK_IDENTIFIER, and sets
   token->details appropriately.

   At most Max_Identifier_Length characters are kept; a longer identifier is
   truncated with a warning and the excess characters are skipped. The first
   character after the identifier is pushed back onto in_stream.

   For TK_IDENTIFIER, token->details.identifier points at a NUL-terminated
   buffer obtained from malloc; it is not freed here. For TK_KEYWORD the
   buffer is released before returning.

   EBNF: identifier = letter { letter | digit | "-" }. */
{
    /* Reserved words and the keyword each one denotes. The value is held as
       an int, which is the type of an enumeration constant in C. */
    static const struct {
        const char *name;
        int keyword;
    } keywords[] = {
        {"AREA", KW_AREA},
        {"ATTRIBUTE", KW_ATTRIBUTE},
        {"CASE", KW_CASE},
        {"CITATION", KW_CITATION},
        {"CLOSING", KW_CLOSING},
        {"COURT", KW_COURT},
        {"EXTERNAL", KW_EXTERNAL},
        {"FACTS", KW_FACTS},
        {"HELP", KW_HELP},
        {"HIERARCHY", KW_HIERARCHY},
        {"IDEAL", KW_IDEAL},
        {"NO", KW_NO},
        {"OPENING", KW_OPENING},
        {"QUESTION", KW_QUESTION},
        {"RESULT", KW_RESULT},
        {"RESULTS", KW_RESULTS},
        {"SUMMARY", KW_SUMMARY},
        {"UNKNOWN", KW_UNKNOWN},
        {"YEAR", KW_YEAR},
        {"YES", KW_YES}
    };
    cardinal length = 1;
    size_t i;
    string identifier;
    string shrunk;
    char message[Max_Error_Message_Length];

    /* allocate memory for the longest possible identifier plus its NUL */
    if ((identifier = (string) malloc((Max_Identifier_Length + 1) * sizeof(char))) == NULL)
        error_exit(log_stream, "malloc failed during keyword/identifier handling",
                token);

    /* put up to Max_Identifier_Length characters into the identifier */
    identifier[0] = (char) ch;
    ch = get_char(in_stream, line_number, column_number, eof);
    while ((length < Max_Identifier_Length) &&
            (is_alpha(ch) || Is_Digit(ch) || (ch == Hyphen_Character))) {
        identifier[length++] = (char) ch;
        ch = get_char(in_stream, line_number, column_number, eof);
    }
    identifier[length] = Null_Character;

    if (is_alpha(ch) || Is_Digit(ch) || (ch == Hyphen_Character)) {
        /* there is more of the identifier, so warn the user and skip over
           the rest of it */
        snprintf(message, sizeof message, "identifier truncated to \"%s\"",
                identifier);
        warning(log_stream, message, token);
        while (is_alpha(ch) || Is_Digit(ch) || (ch == Hyphen_Character))
            ch = get_char(in_stream, line_number, column_number, eof);
    } else {
        /* shrink the buffer to fit; the size must include the terminating
           NUL, otherwise the comparisons below read past the allocation */
        shrunk = (string) realloc((void *) identifier,
                (length + 1) * sizeof(char));
        if (shrunk == NULL)
            error_exit(log_stream,
                    "realloc failed during keyword/identifier handling", token);
        identifier = shrunk;
    }

    /* push the first character after the identifier back onto in_stream */
    unget_char(in_stream, log_stream, ch, token, line_number, column_number);

    /* check whether the identifier is a keyword */
    for (i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
        if (strcmp(identifier, keywords[i].name) == 0) {
            token->token = TK_KEYWORD;
            token->details.keyword = keywords[i].keyword;
            /* a keyword token carries no text, so the buffer is not needed */
            free(identifier);
            return;
        }
    }

    /* the identifier is not a keyword; ownership of the buffer passes to the
       token */
    token->token = TK_IDENTIFIER;
    token->details.identifier = identifier;
}
static void
get_string(
file in_stream,
file log_stream,
token_details *token,
cardinal *line_number,
cardinal *column_number,
boolean *eof)
/* Gets a string (a " character has just been read). Treats a pair of
consecutive " characters as a single " character. Treats consecutive
whitespace characters as a single space character. Sets token->details
appropriately (token->token has already been set to TK_STRING).
EBNF: string = """" character { character } """". */
{
int ch,
next_ch;
string temp_string;
cardinal allocated_length,
actual_length;
allocated_length = String_Increment;
actual_length = 0;
/* allocate memory for the string */
if ((temp_string = (string) malloc(allocated_length * sizeof(char))) == NULL)
error_exit(log_stream, "malloc failed during string handling", token);
/* get the first character of the string */
ch = get_char(in_stream, line_number, column_number, eof);
for (;;) {
if (ch == EOF)
error_exit(log_stream, "end of file in string", token);
if (strchr(Quoted_LaTeX_Characters, ch) != NULL) {
/* the character is one of those in Quoted_LaTeX_Characters (i.e. it
is $, & or %); it has a special meaning in LaTeX and needs to be
prefixed in the string by a \ character */
temp_string[actual_length++] = Backslash_Character;
if (actual_length == allocated_length)
/* the string is too long for temp_string, so reallocate some
more memory */
if ((temp_string = (string) realloc((void *) temp_string,
(allocated_length += String_Increment) *
sizeof(char))) == NULL)
error_exit(log_stream, "realloc failed during string handling",
token);
}
if (ch == Quote_Character)
/* the character is a " character */
if ((next_ch = get_char(in_stream, line_number, column_number, eof)) !=
Quote_Character) {
/* the next character is not a " character so this is the end of
the string; push the first character after the string back
onto in_stream */
unget_char(in_stream, log_stream, next_ch, token, line_number, column_number);
if (actual_length == 0)
error_exit(log_stream, "empty string", token);
else {
temp_string[actual_length++] = Null_Character;
if (actual_length < allocated_length)
/* reallocate (just enough) memory for the string */
if ((temp_string = (string) realloc((void *) temp_string,
actual_length * sizeof(char))) == NULL)
error_exit(log_stream, "realloc failed during string handling",
token);
}
token->details.string = temp_string;
return;
}
if (is_whitespace(ch)) {
/* skip to the next non-whitespace character */
for (ch = get_char(in_stream, line_number, column_number, eof);
is_whitespace(ch);
ch = get_char(in_stream, line_number, column_number, eof));
if (ch == EOF)
error_exit(log_stream, "end of file in string", token);
/* put a single space character in the string for all of the
whitespace */
temp_string[actual_length++] = Space_Character;
/* push the non-whitespace character back onto in_stream */
unget_char(in_stream, log_stream, ch, token, line_number, column_number);
} else
temp_string[actual_length++] = ch;
if (actual_length == allocated_length)
/* the string is too long for temp_string, so reallocate some more
memory */
if ((temp_string = (string) realloc((void *) temp_string,
(allocated_length += String_Increment) *
sizeof(char))) == NULL)
error_exit(log_stream, "realloc failed during string handling", token);
/* get the next character */
ch = get_char(in_stream, line_number, column_number, eof);
}
}
static void
get_year(
        file in_stream,
        file log_stream,
        int ch,
        token_details *token,
        cardinal *line_number,
        cardinal *column_number,
        boolean *eof)
/* Reads a year whose first digit, ch, has already been consumed, and stores
   its numeric value in token->details.year (token->token was set to TK_YEAR
   by the caller). A year may have at most Year_Digits digits; a longer run
   of digits is reported through error_exit. The first character after the
   year is pushed back onto in_stream.

   EBNF: year = digit [ digit ] [ digit ] [ digit ]. */
{
    cardinal value = (cardinal) ch - (cardinal) Zero_Character;
    cardinal count = 1;

    /* accumulate further digits, most significant first */
    ch = get_char(in_stream, line_number, column_number, eof);
    while (Is_Digit(ch) && (count < Year_Digits)) {
        value = (10 * value) + (cardinal) ch - (cardinal) Zero_Character;
        count++;
        ch = get_char(in_stream, line_number, column_number, eof);
    }

    /* a digit still pending here means the limit was exceeded */
    if (Is_Digit(ch))
        error_exit(log_stream, "year has too many digits", token);

    unget_char(in_stream, log_stream, ch, token, line_number, column_number);
    token->details.year = value;
}
static void
get_attribute_vector(
        file in_stream,
        file log_stream,
        token_details *token,
        cardinal *line_number,
        cardinal *column_number,
        boolean *eof)
/* Gets an attribute vector (a left parenthesis character has just been
   read). Sets token->details appropriately (token->token has already been
   set to TK_ATTRIBUTE_VECTOR).

   Builds a list of matrix_element structures, one per attribute value,
   linked through case_next (attribute_next is set to NULL in each), and
   stores the head of the list in token->details.matrix_head. The elements
   are obtained from malloc and are not freed here.

   An empty vector, a character other than the three attribute values, and
   end of file before the closing parenthesis are all reported through
   error_exit.

   EBNF: attribute-vector = "(" attribute-value { attribute-value } ")".
         attribute-value = "Y" | "N" | "U". */
{
    int ch;
    matrix_element *matrix_head,
        *matrix_pointer;
    boolean empty = TRUE;
    char message[Max_Error_Message_Length];

    /* allocate memory for this matrix element (the first in the list) */
    if ((matrix_head = (matrix_element *) malloc(sizeof(matrix_element))) == NULL)
        error_exit(log_stream, "malloc failed during attribute vector handling",
                token);
    matrix_pointer = matrix_head;

    /* for every character that is not a right parenthesis ... */
    for (ch = get_char(in_stream, line_number, column_number, eof);
            ch != Attribute_Vector_End_Character;
            ch = get_char(in_stream, line_number, column_number, eof)) {

        /* EOF is not a character: report it as such instead of printing it
           with %c as an "invalid attribute value" */
        if (ch == EOF)
            error_exit(log_stream, "end of file in attribute vector", token);

        if (!empty) {
            /* allocate memory for this matrix element */
            if ((matrix_pointer->case_next =
                            (matrix_element *) malloc(sizeof(matrix_element))) == NULL)
                error_exit(log_stream,
                        "malloc failed during attribute vector handling", token);
            matrix_pointer = matrix_pointer->case_next;
        }

        switch (ch) {
            case Yes_Character:
                matrix_pointer->attribute_value = YES;
                break;
            case No_Character:
                matrix_pointer->attribute_value = NO;
                break;
            case Unknown_Character:
                matrix_pointer->attribute_value = UNKNOWN;
                break;
            default:
                snprintf(message, sizeof message,
                        "invalid attribute value `%c'", ch);
                error_exit(log_stream, message, token);
                break;
        }
        empty = FALSE;

        /* this element is, for now, the last one in the list */
        matrix_pointer->case_next = NULL;
        matrix_pointer->attribute_next = NULL;
    }

    if (empty)
        error_exit(log_stream, "empty attribute vector", token);
    token->details.matrix_head = matrix_head;
}
static void
skip_to_end_of_line(
        file in_stream,
        cardinal *line_number,
        cardinal *column_number,
        boolean *eof)
/* Discards characters from in_stream up to and including the next carriage
   return, or up to the end of the file if that comes first. */
{
    int ch;

    do {
        ch = get_char(in_stream, line_number, column_number, eof);
    } while ((ch != EOF) && (ch != Carriage_Return_Character));
}
extern token_details
Get_Token(
        file in_stream,
        file log_stream)
/* Returns details of the next token from in_stream.

   Leading whitespace and comments (from Comment_Character to the end of the
   line) are skipped. The returned token records the line and column
   counters as they stood just after its first character was read. Once the
   end of in_stream has been seen, every call returns TK_EOF.

   The position counters and the end-of-file flag are static, so they persist
   between calls and are shared by all streams passed to this function.

   A character that cannot start any token is reported through error_exit. */
{
    static cardinal line_number = 1,
        column_number = 0;
    static boolean eof = FALSE;
    token_details token;
    char message[Max_Error_Message_Length];
    int ch;

    for (;;) {
        if (eof) {
            /* end of file was reached earlier (possibly while reading the
               previous token or a comment); fill in the position as well so
               that the caller never sees uninitialised fields */
            token.line_number = line_number;
            token.column_number = column_number;
            token.token = TK_EOF;
            return token;
        }

        /* skip to the next non-whitespace character */
        for (ch = get_char(in_stream, &line_number, &column_number, &eof);
                is_whitespace(ch);
                ch = get_char(in_stream, &line_number, &column_number, &eof));

        token.line_number = line_number;
        token.column_number = column_number;

        /* the first character decides what kind of token follows */
        if (is_alpha(ch)) {
            get_keyword_or_ident(in_stream, log_stream, ch, &token,
                    &line_number, &column_number, &eof);
            return token;
        } else if (ch == Quote_Character) {
            token.token = TK_STRING;
            get_string(in_stream, log_stream, &token,
                    &line_number, &column_number, &eof);
            return token;
        } else if (Is_Digit(ch)) {
            token.token = TK_YEAR;
            get_year(in_stream, log_stream, ch, &token,
                    &line_number, &column_number, &eof);
            return token;
        } else if (ch == Attribute_Vector_Begin_Character) {
            token.token = TK_ATTRIBUTE_VECTOR;
            get_attribute_vector(in_stream, log_stream, &token,
                    &line_number, &column_number, &eof);
            return token;
        } else if (ch == Equals_Character) {
            token.token = TK_EQUALS;
            return token;
        } else if (ch == EOF) {
            token.token = TK_EOF;
            return token;
        } else if (ch == Comment_Character) {
            /* ignore the comment and go round again for the next token */
            skip_to_end_of_line(in_stream, &line_number, &column_number, &eof);
        } else {
            snprintf(message, sizeof message, "invalid character `%c'", ch);
            error_exit(log_stream, message, &token);
        }
    }
}
Other SHYSTER modules: Shyster, Statutes, Cases, Parser, Dumper, Checker, Scales, Adjuster, Consultant, Odometer and Reporter.