summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--Makefile8
-rw-r--r--main.c685
3 files changed, 634 insertions, 60 deletions
diff --git a/.gitignore b/.gitignore
index 1dabaad..97ce826 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ main
m68k-trasm
cmake[-_]build*/
build*/
+compile_commands.json
diff --git a/Makefile b/Makefile
index 5b1111e..d13a5b3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,12 @@
# SPDX-License-Identifier: Unlicense
WARNFLAGS = -Wall -Wextra -pedantic -Wlogical-op
-OPTFLAGS = -O2
INCLUDES = lib
-_CFLAGS = $(CFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(OPTFLAGS) -pipe -g
-_CXXFLAGS = $(CXXFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(OPTFLAGS) -pipe -g
+_FLAGS = -O2 -fsanitize=address
+_CFLAGS = $(CFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g
+_CXXFLAGS = $(CXXFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g
LDSCRIPTS =
-_LDFLAGS = $(LDFLAGS) $(OPTFLAGS) $(addprefix -T,$(LDSCRIPTS))
+_LDFLAGS = $(LDFLAGS) $(_FLAGS) $(addprefix -T,$(LDSCRIPTS))
OBJECTS=main.o
diff --git a/main.c b/main.c
index 3020862..b8b6fff 100644
--- a/main.c
+++ b/main.c
@@ -12,55 +12,90 @@
#include <stdlib.h>
#include <stdint.h>
-#define OK 0
-#define ERR 2
-
-enum tok_kind {
- TK_NONE = 0,
- TK_SPACE,
- TK_TAB,
- TK_CR,
- TK_LF,
- TK_DOT,
- TK_COMMA,
- TK_PLUS,
- TK_MINUS,
- TK_EQ,
- TK_COLON,
- TK_ASTERISK,
- TK_STRING,
- TK_ALNUM,
- TK_NUMDEC,
- TK_NUMOCT,
- TK_NUMHEX,
- TK_PARENL,
- TK_PARENR,
- TK_COMMENT_ASTERISK,
- TK_COMMENT_SEMICOLON,
+// Compile-time switch: when non-zero, assem_emit() prints the lexer's tokens.
+#ifndef TRACE_LEX
+#define TRACE_LEX 1
+#endif
+
+// Status codes returned by the lex_*, pars_* and assem_* functions.
+#define ERR 0
+#define OK 1
+// Returned by lex_handle_next() when the character was handled without error.
+#define CONTINUE 2
+
+// Token kinds stored in struct token; tok_kind_to_string() names each one.
+enum token_type {
+ TT_NONE = 0,
+ TT_SPACE,
+ TT_TAB,
+ TT_NEWLINE,
+ TT_DOT,
+ TT_COMMA,
+ TT_PLUS,
+ TT_MINUS,
+ TT_EQ,
+ TT_COLON,
+ TT_PERCENT,
+ TT_HASH,
+ TT_ASTERISK,
+ TT_STRING,
+ TT_IDENTIFIER,
+ TT_NUMDEC,
+ TT_NUMOCT,
+ TT_NUMHEX,
+ TT_LPAREN,
+ TT_RPAREN,
+ TT_COMMENT_ASTERISK,
+ TT_COMMENT_SEMICOLON,
};
-struct tok {
- enum tok_kind kind;
+// A lexed token: its kind plus the byte range it covers in the lexer's input
+// buffer (fprint_tok() prints `length` bytes starting at input + offset).
+struct token {
+ enum token_type type;
size_t offset;
size_t length;
};
enum lex_error {
LE_NONE = 0,
+ LE_SOME,
+};
+
+// States of the character-driven state machine in lex_handle_next().
+enum lex_state {
+ LS_FREE = 0, // Between tokens: the next character decides what starts
+ LS_CR, // Seen '\r', waiting to learn whether '\n' follows
+ LS_SPACE, // Accumulating a run of spaces
+ LS_TAB, // Accumulating a run of tabs
+ LS_IDENTIFIER,
+ LS_NUMOCTHEX, // Seen a leading '0': octal, hex or plain zero
+ LS_NUMOCT,
+ LS_NUMHEX,
+ LS_NUMDEC,
+ LS_STRING,
+ LS_STRING_ESC, // Character right after a backslash inside a string
+ LS_COMMENT_ASTERISK,
+ LS_COMMENT_SEMICOLON,
+ LS_ERROR, // Set by lex_yield_error(); further input yields ERR
+ LS_EOF, // EOF was seen while in LS_FREE
+};
+
+// Position computed by lex_get_line_pos_info(): zero-based line and column
+// counters plus the input buffer offset at which the current line starts.
+struct line_pos_info {
+ unsigned long line_num;
+ unsigned long column_num;
+ unsigned long line_offset;
};
struct lex {
+ // State variables
+ enum lex_state state;
+ enum lex_error error;
+ size_t cursor;
+ size_t tok_offset;
+ size_t tokens_count;
+ bool inside_line;
// Input data buffer
- FILE *parsed_input_stream;
- char *parsed_input;
- size_t parsed_input_size;
+ FILE *input_stream;
+ char *input;
+ size_t input_size;
// Tokens table
FILE *tokbuf_stream;
- struct tok *tokbuf;
+ struct token *tokbuf;
size_t tokbuf_size;
- // State
- size_t offset;
- enum lex_error error;
};
enum stmt_kind {
@@ -131,7 +166,7 @@ struct def_endef {
};
struct stmt {
- enum stmt_kind kind;
+ enum stmt_kind type;
union {
struct instruction instruction;
int32_t align;
@@ -149,8 +184,17 @@ struct symbol {
uint32_t hash;
};
+// Parser error codes, stored in the `error` field of struct pars.
+enum pars_error {
+ PE_NONE = 0,
+ PE_LEX,
+ PE_SOME,
+};
+
struct pars {
- struct lex *lex;
+ const struct lex *lex;
+ // State
+ size_t cur_tok_id;
+ enum pars_error error;
// Statement table
FILE *stmttab_stream;
struct stmt *stmttab;
@@ -166,49 +210,567 @@ struct pars {
};
struct assem {
- struct pars *pars;
+ const struct pars *pars;
};
+// Printable spelling of every byte value, indexed by the byte cast to
+// unsigned char. Control and non-ASCII bytes become "\xNN" (or \t, \n, \r);
+// '"', '\\', '<' and '>' are backslash-escaped. All 256 entries must be
+// present: a missing one would be NULL and crash fputs().
+const char *const g_escape_table[256] = {
+ "\\x00", "\\x01", "\\x02", "\\x03", "\\x04", "\\x05", "\\x06", "\\x07",
+ "\\x08", "\\t", "\\n", "\\x0b", "\\x0c", "\\r", "\\x0e", "\\x0f", "\\x10",
+ "\\x11", "\\x12", "\\x13", "\\x14", "\\x15", "\\x16", "\\x17", "\\x18",
+ "\\x19", "\\x1a", "\\x1b", "\\x1c", "\\x1d", "\\x1e", "\\x1f", " ", "!",
+ "\\\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0",
+ "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "\\<", "=", "\\>", "?",
+ "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
+ "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\\\",
+ "]", "^", "_", "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
+ "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
+ "{", "|", "}", "~", "\\x7f", "\\x80", "\\x81", "\\x82", "\\x83", "\\x84",
+ "\\x85", "\\x86", "\\x87", "\\x88", "\\x89", "\\x8a", "\\x8b", "\\x8c",
+ "\\x8d", "\\x8e", "\\x8f", "\\x90", "\\x91", "\\x92", "\\x93", "\\x94",
+ "\\x95", "\\x96", "\\x97", "\\x98", "\\x99", "\\x9a", "\\x9b", "\\x9c",
+ "\\x9d", "\\x9e", "\\x9f", "\\xa0", "\\xa1", "\\xa2", "\\xa3", "\\xa4",
+ "\\xa5", "\\xa6", "\\xa7", "\\xa8", "\\xa9", "\\xaa", "\\xab", "\\xac",
+ "\\xad", "\\xae", "\\xaf", "\\xb0", "\\xb1", "\\xb2", "\\xb3", "\\xb4",
+ "\\xb5", "\\xb6", "\\xb7", "\\xb8", "\\xb9", "\\xba", "\\xbb", "\\xbc",
+ "\\xbd", "\\xbe", "\\xbf", "\\xc0", "\\xc1", "\\xc2", "\\xc3", "\\xc4",
+ "\\xc5", "\\xc6", "\\xc7", "\\xc8", "\\xc9", "\\xca", "\\xcb", "\\xcc",
+ "\\xcd", "\\xce", "\\xcf", "\\xd0", "\\xd1", "\\xd2", "\\xd3", "\\xd4",
+ "\\xd5", "\\xd6", "\\xd7", "\\xd8", "\\xd9", "\\xda", "\\xdb", "\\xdc",
+ "\\xdd", "\\xde", "\\xdf", "\\xe0", "\\xe1", "\\xe2", "\\xe3", "\\xe4",
+ "\\xe5", "\\xe6", "\\xe7", "\\xe8", "\\xe9", "\\xea", "\\xeb", "\\xec",
+ "\\xed", "\\xee", "\\xef", "\\xf0", "\\xf1", "\\xf2", "\\xf3", "\\xf4",
+ "\\xf5", "\\xf6", "\\xf7", "\\xf8", "\\xf9", "\\xfa", "\\xfb", "\\xfc",
+ "\\xfd", "\\xfe", "\\xff",
+};
+
+/** Tell whether `c` must be printed through g_escape_table rather than as is:
+ * true for '"', '\\', '<', '>' and anything outside the ' '..'~' range.
+ */
+static bool should_be_escaped(const int c)
+{
+    switch (c) {
+    case '"':
+    case '\\':
+    case '<':
+    case '>':
+        return true;
+    default:
+        return c < ' ' || c > '~';
+    }
+}
+
+/** True if `c` is an octal digit, '0' through '7'. */
+static bool is_oct(const int c)
+{
+    return !(c < '0' || c > '7');
+}
+
+/** True if `c` is a decimal digit, '0' through '9'. */
+static bool is_dec(const int c)
+{
+    if (c < '0') {
+        return false;
+    }
+    return c <= '9';
+}
+
+/** True if `c` is a hexadecimal digit: 0-9, A-F or a-f. */
+static bool is_hex(const int c)
+{
+    if (is_dec(c)) {
+        return true;
+    }
+    const bool upper = c >= 'A' && c <= 'F';
+    const bool lower = c >= 'a' && c <= 'f';
+    return upper || lower;
+}
+
+/** True if `c` is an ASCII letter, A-Z or a-z. */
+static bool is_alphabetic(const int c)
+{
+    const bool upper = c >= 'A' && c <= 'Z';
+    const bool lower = c >= 'a' && c <= 'z';
+    return upper || lower;
+}
+
+/** True if `c` is an ASCII letter or a decimal digit. */
+static bool is_alphanum(const int c)
+{
+    if (is_alphabetic(c)) {
+        return true;
+    }
+    return is_dec(c);
+}
+
+/** Number of characters the escaped spelling of byte `c` occupies.
+ * Matches g_escape_table: two characters for \t, \n, \r, \", \\, \< and \>,
+ * four ("\xNN") for any other byte outside the ' '..'~' range, and one for a
+ * byte that is printed as is.
+ */
+static int printed_size(const char c)
+{
+    // Compare as unsigned char so bytes >= 0x80 behave the same whether plain
+    // char is signed or unsigned on this platform.
+    const unsigned char uc = (unsigned char)c;
+    switch (uc) {
+    case '\t':
+    case '\n':
+    case '\r':
+    case '"':
+    case '\\':
+    case '<':
+    case '>':
+        return sizeof("\\\\") - 1;
+    default:
+        break;
+    }
+    if (uc < ' ' || uc > '~') {
+        return sizeof("\\x00") - 1;
+    }
+    return 1;
+}
+
+/** Write `length` bytes of `str` to `stream`, escaping them per g_escape_table.
+ * \returns the sum of printed_size() over the bytes written.
+ * \returns -1 if writing to `stream` failed.
+ */
+static int fprint_string_escaped(
+        const char *const str, const size_t length, FILE *const stream)
+{
+    int written = 0;
+    for (size_t i = 0; i < length; i++) {
+        // Read the byte once, before `i` moves on: sizing it after the
+        // increment would measure the next byte and run past the end.
+        const char c = str[i];
+        int res;
+        if (should_be_escaped(c)) {
+            res = fputs(g_escape_table[(unsigned char)c], stream);
+        } else {
+            res = fputc(c, stream);
+        }
+        if (res == EOF) {
+            return -1;
+        }
+        written += printed_size(c);
+    }
+    return written;
+}
+
+/** Name of a token type as used in the token trace.
+ * Asserts on a value outside enum token_type; returns "UNKNOWN" for it when
+ * assertions are compiled out.
+ */
+static const char *tok_kind_to_string(const enum token_type type)
+{
+    static const char *const names[] = {
+        [TT_NONE] = "NONE",
+        [TT_SPACE] = "SPACE",
+        [TT_TAB] = "TAB",
+        [TT_NEWLINE] = "NEWLINE",
+        [TT_DOT] = "DOT",
+        [TT_COMMA] = "COMMA",
+        [TT_PLUS] = "PLUS",
+        [TT_MINUS] = "MINUS",
+        [TT_EQ] = "EQ",
+        [TT_COLON] = "COLON",
+        [TT_PERCENT] = "PERCENT",
+        [TT_HASH] = "HASH",
+        [TT_ASTERISK] = "ASTERISK",
+        [TT_STRING] = "STRING",
+        [TT_IDENTIFIER] = "IDENTIFIER",
+        [TT_NUMDEC] = "NUMDEC",
+        [TT_NUMOCT] = "NUMOCT",
+        [TT_NUMHEX] = "NUMHEX",
+        [TT_LPAREN] = "PARENL",
+        [TT_RPAREN] = "PARENR",
+        [TT_COMMENT_ASTERISK] = "COMMENT",
+        [TT_COMMENT_SEMICOLON] = "COMMENT",
+    };
+    const size_t index = (size_t)type;
+    if (index < sizeof names / sizeof names[0] && names[index] != NULL) {
+        return names[index];
+    }
+    assert(0);
+    return "UNKNOWN";
+}
+
+/** Print one token as `KIND<escaped text>` followed by a newline.
+ * \param input Buffer that `token->offset` and `token->length` refer to.
+ * \returns number of characters accounted for, or -1 on a write error.
+ */
+static int fprint_tok(const char *const input, struct token *token, FILE *const stream)
+{
+    int res = fprintf(stream, "%s<", tok_kind_to_string(token->type));
+    if (res < 0) {
+        return -1;
+    }
+    int written = res;
+    res = fprint_string_escaped(input + token->offset, token->length, stream);
+    if (res < 0) {
+        return -1;
+    }
+    written += res;
+    // fputs() reports only success (non-negative) or EOF, never a byte count,
+    // so the length of the trailer is added explicitly.
+    if (fputs(">\n", stream) == EOF) {
+        return -1;
+    }
+    written += (int)(sizeof(">\n") - 1);
+    return written;
+}
+
+/** Reset the lexer and open its two growable memory streams.
+ * \returns OK on success.
+ * \returns ERR if a memory stream could not be opened; in that case nothing
+ * is left open or allocated and lex_destroy() must not be called.
+ */
static int lex_init(struct lex *const self)
{
- (void) self;
+    *self = (struct lex){
+        .input_stream = open_memstream(&self->input, &self->input_size),
+        .tokbuf_stream = open_memstream(
+                (char **)&self->tokbuf, &self->tokbuf_size),
+    };
+    if (self->input_stream == NULL || self->tokbuf_stream == NULL) {
+        // Report the failure instead of asserting: assert() disappears under
+        // NDEBUG. Release whichever stream did open before bailing out.
+        if (self->input_stream != NULL) {
+            fclose(self->input_stream);
+            free(self->input);
+        }
+        if (self->tokbuf_stream != NULL) {
+            fclose(self->tokbuf_stream);
+            free(self->tokbuf);
+        }
+        *self = (struct lex){0};
+        return ERR;
+    }
return OK;
}
-static int lex_next(struct lex *const self)
+/** Append one raw `struct token` record to `stream`.
+ * Asserts that the record was written.
+ * \returns the fwrite() result, i.e. 1.
+ */
+static int fwrite_token(const struct token *const token, FILE *const stream)
{
- (void) self;
+ const int res = fwrite(token, sizeof *token, 1, stream);
+ assert(res == 1);
+ return res;
+}
+
+/** Append `token` to the token buffer and update the lexer bookkeeping. */
+static void lex_yield_token(struct lex *const self, const struct token *const token)
+{
+    // A newline token ends the line; any other token means we are inside one
+    if (token->type == TT_NEWLINE) {
+        self->inside_line = false;
+    } else {
+        self->inside_line = true;
+    }
+    fwrite_token(token, self->tokbuf_stream);
+    self->tokens_count += 1;
+}
+
+/** Describe which characters the lexer would have accepted at the point of
+ * failure, for use in the "expected ..." part of an error message.
+ * \param state Lexer state in which the unexpected character arrived.
+ * \param inside_line False when no token has been emitted on the current line
+ * yet; only matters in LS_FREE, where '*' may then start a comment.
+ * \returns a static string; "???" (after an assert) for states that never
+ * report errors.
+ */
+static const char *lex_state_error_string(
+        const enum lex_state state, const bool inside_line)
+{
+    switch (state) {
+    case LS_FREE:
+        // A number that starts a line fails in one of the LS_NUM* states with
+        // inside_line still false, so the line-start wording is chosen here
+        // rather than asserted up front for every state.
+        if (!inside_line) {
+            return "'*', ';', '0', '[1-9]', '[a-zA-Z_]', ',', '.', '(', ')', '+', "
+                "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
+                "or EOF";
+        }
+        return "';', '0', '[1-9]', '[a-zA-Z_]', ',', '.', '(', ')', '+', "
+            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
+            "or EOF";
+    case LS_NUMOCTHEX:
+        return "';', '[0-7]', [xX], ',', '.', '(', ')', '+', "
+            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
+            "or EOF";
+    case LS_NUMOCT:
+        return "';', '[0-7]' , ',', '.', '(', ')', '+', "
+            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
+            "or EOF";
+    case LS_NUMHEX:
+        // Only hexadecimal digits continue a hex literal, not every letter
+        return "';', '[0-9a-fA-F]' , ',', '.', '(', ')', '+', "
+            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
+            "or EOF";
+    case LS_NUMDEC:
+        return "';', '[0-9]' , ',', '.', '(', ')', '+', "
+            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
+            "or EOF";
+    case LS_CR:
+    case LS_SPACE:
+    case LS_TAB:
+    case LS_IDENTIFIER:
+    case LS_STRING:
+    case LS_STRING_ESC:
+    case LS_COMMENT_ASTERISK:
+    case LS_COMMENT_SEMICOLON:
+    case LS_ERROR:
+    case LS_EOF:
+        // These states accept any character, so they never raise an error
+        assert(0);
+        break;
+    }
+    return "???";
+}
+
+/** Compute the line, column and line start offset of the last byte in the
+ * lexer's input buffer.
+ * CR, LF and CRLF each count as one line break. All counters are zero based.
+ * The caller must have flushed `input_stream` so `input_size` is current.
+ */
+static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
+{
+    struct line_pos_info l = {0, 0, 0};
+    if (self->input_size == 0) {
+        // Nothing buffered; also keeps `input_size - 1` from wrapping around
+        return l;
+    }
+    bool cr = false;
+    // The last buffered byte is the character whose position is wanted
+    // (lex_next() stores each character before handling it), so only the
+    // bytes before it are walked.
+    for (size_t i = 0; i < self->input_size - 1; i++) {
+        const char c = self->input[i];
+        if (c == '\r') {
+            cr = true;
+            l.line_offset = i + 1;
+            l.line_num++;
+            l.column_num = 0;
+        } else if (c == '\n') {
+            // An LF right after a CR completes a CRLF pair that has already
+            // been counted, so test `cr` before clearing it.
+            if (!cr) {
+                l.line_num++;
+            }
+            cr = false;
+            l.line_offset = i + 1;
+            l.column_num = 0;
+        } else {
+            cr = false;
+            l.column_num++;
+        }
+    }
+    return l;
+}
+
+/** Report a lexing error for the unexpected character `c` on stderr and put
+ * the lexer into LS_ERROR.
+ * Prints the position, what was expected, the offending source line and a
+ * caret under the offending column.
+ * \returns ERR, always.
+ */
+static int lex_yield_error(struct lex *const self, const int c)
+{
+    fflush(self->input_stream);
+    const struct line_pos_info l = lex_get_line_pos_info(self);
+    // Read out the rest of the line so the whole offending line can be shown.
+    // NOTE(review): this reads from stdin, not from the stream lex_next() was
+    // given; the two only agree because main() lexes stdin -- confirm.
+    int next;
+    do {
+        next = getc(stdin);
+        const char next_char = (next == EOF) ? 0 : next;
+        fwrite(&next_char, sizeof next_char, 1, self->input_stream);
+    } while (next != EOF && next != '\n' && next != '\r');
+    fflush(self->input_stream);
+    // Length of the offending line without its terminator; printing the
+    // terminator too would put a blank line between the text and the caret.
+    size_t line_length = 0;
+    while (l.line_offset + line_length < self->input_size) {
+        const char ch = self->input[l.line_offset + line_length];
+        if (ch == '\0' || ch == '\n' || ch == '\r') {
+            break;
+        }
+        line_length++;
+    }
+    fprintf(
+            stderr,
+            "<stdin>:%lu:%lu: lexing error: expected %s, found '%c'\n",
+            l.line_num + 1,
+            l.column_num + 1,
+            lex_state_error_string(self->state, self->inside_line),
+            c);
+    fprintf(
+            stderr,
+            "%5lu | %.*s\n",
+            l.line_num + 1,
+            (int)line_length,
+            self->input + l.line_offset);
+    fputs(" | ", stderr);
+    // Repeat tabs from the source line so the caret lines up under the column
+    for (size_t i = 0; i < l.column_num; i++) {
+        if (self->input[l.line_offset + i] == '\t') {
+            fputc('\t', stderr);
+        } else {
+            fputc(' ', stderr);
+        }
+    }
+    fputs("^\n", stderr);
+    self->state = LS_ERROR;
+    return ERR;
+}
+
+/** Map a character that forms a complete token on its own to its token type.
+ * \returns TT_NONE if `c` is not such a character.
+ */
+static enum token_type lex_single_char_token_type(const int c)
+{
+    switch (c) {
+    case ',': return TT_COMMA;
+    case '.': return TT_DOT;
+    case '(': return TT_LPAREN;
+    case ')': return TT_RPAREN;
+    case '+': return TT_PLUS;
+    case '-': return TT_MINUS;
+    case '=': return TT_EQ;
+    case ':': return TT_COLON;
+    case '%': return TT_PERCENT;
+    case '#': return TT_HASH;
+    case '\n': return TT_NEWLINE;
+    default: return TT_NONE;
+    }
+}
+
+/** Remember that a multi-character token starts at the cursor and enter `state`. */
+static void lex_begin_token(struct lex *const self, const enum lex_state state)
+{
+    self->tok_offset = self->cursor;
+    self->state = state;
+}
+
+/** Emit the token that started at `tok_offset` and return to LS_FREE. */
+static void lex_finish_token(
+        struct lex *const self, const enum token_type type, const size_t length)
+{
+    lex_yield_token(self, &(struct token){type, self->tok_offset, length});
+    self->state = LS_FREE;
+}
+
+/** Feed one character (or EOF) into the lexer state machine.
+ * A character that terminates a multi-character token without being part of
+ * it is handled a second time in LS_FREE through a recursive call.
+ * \returns CONTINUE when the character was handled.
+ * \returns ERR when an error was reported or the lexer is already in LS_ERROR.
+ */
+static int lex_handle_next(struct lex *const self, const int c)
+{
+    switch (self->state) {
+    case LS_FREE:
+        {
+            const enum token_type single = lex_single_char_token_type(c);
+            if (single != TT_NONE) {
+                lex_yield_token(self, &(struct token){single, self->cursor, 1});
+            } else if (is_alphabetic(c) || c == '_') {
+                lex_begin_token(self, LS_IDENTIFIER);
+            } else if (c == '0') {
+                lex_begin_token(self, LS_NUMOCTHEX);
+            } else if (is_dec(c)) {
+                lex_begin_token(self, LS_NUMDEC);
+            } else if (c == '"') {
+                lex_begin_token(self, LS_STRING);
+            } else if (c == ';') {
+                lex_begin_token(self, LS_COMMENT_SEMICOLON);
+            } else if (c == '*') {
+                // '*' opens a comment only before any token on the line
+                if (self->inside_line) {
+                    return lex_yield_error(self, c);
+                }
+                lex_begin_token(self, LS_COMMENT_ASTERISK);
+            } else if (c == '\r') {
+                lex_begin_token(self, LS_CR);
+            } else if (c == ' ') {
+                lex_begin_token(self, LS_SPACE);
+            } else if (c == '\t') {
+                lex_begin_token(self, LS_TAB);
+            } else if (c == EOF) {
+                self->state = LS_EOF;
+            } else {
+                return lex_yield_error(self, c);
+            }
+        }
+        break;
+    case LS_CR: // Accumulate CRLF into single token
+        // 2 for CRLF, 1 for just CR
+        lex_finish_token(self, TT_NEWLINE, c == '\n' ? 2 : 1);
+        if (c != '\n') {
+            // It is just CR, handle this char in LS_FREE state then
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_SPACE: // Accumulate multiple spaces into single token
+        if (c != ' ') {
+            lex_finish_token(self, TT_SPACE, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_TAB: // Accumulate multiple tabs into single token
+        if (c != '\t') {
+            lex_finish_token(self, TT_TAB, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_IDENTIFIER:
+        if (!is_alphanum(c) && c != '_') {
+            lex_finish_token(
+                    self, TT_IDENTIFIER, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_NUMOCTHEX:
+        if (c == 'x' || c == 'X') {
+            self->state = LS_NUMHEX;
+        } else if (is_oct(c)) {
+            self->state = LS_NUMOCT;
+        } else if (is_alphabetic(c) || c == '_') {
+            return lex_yield_error(self, c);
+        } else {
+            // It was just zero, handle this char in LS_FREE state then
+            assert((self->cursor - self->tok_offset) == 1);
+            lex_finish_token(self, TT_NUMDEC, 1);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_NUMOCT:
+        if (is_alphabetic(c) || c == '_') {
+            return lex_yield_error(self, c);
+        } else if (!is_oct(c)) {
+            // This token is finished, handle this char in LS_FREE state
+            lex_finish_token(self, TT_NUMOCT, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_NUMHEX:
+        if (is_hex(c)) {
+            // Still inside the literal
+        } else if (is_alphabetic(c) || c == '_') {
+            return lex_yield_error(self, c);
+        } else {
+            // This token is finished, handle this char in LS_FREE state
+            lex_finish_token(self, TT_NUMHEX, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_NUMDEC:
+        if (is_alphabetic(c) || c == '_') {
+            return lex_yield_error(self, c);
+        } else if (!is_dec(c)) {
+            // This token is finished, handle this char in LS_FREE state
+            lex_finish_token(self, TT_NUMDEC, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_STRING:
+        // NOTE(review): EOF inside a string is silently accepted here and the
+        // unterminated string is never emitted -- confirm intended handling.
+        if (c == '\\') {
+            self->state = LS_STRING_ESC;
+        } else if (c == '"') {
+            // The closing quote belongs to the token, hence the + 1
+            lex_finish_token(
+                    self, TT_STRING, self->cursor - self->tok_offset + 1);
+        }
+        break;
+    case LS_STRING_ESC:
+        // Whatever follows the backslash is taken as part of the string
+        self->state = LS_STRING;
+        break;
+    case LS_COMMENT_ASTERISK:
+        // EOF ends a comment too, otherwise a comment on a last line without
+        // a trailing newline would never be emitted.
+        if (c == '\r' || c == '\n' || c == EOF) {
+            lex_finish_token(
+                    self, TT_COMMENT_ASTERISK, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_COMMENT_SEMICOLON:
+        if (c == '\r' || c == '\n' || c == EOF) {
+            lex_finish_token(
+                    self, TT_COMMENT_SEMICOLON, self->cursor - self->tok_offset);
+            return lex_handle_next(self, c);
+        }
+        break;
+    case LS_ERROR:
+        return ERR;
+    case LS_EOF:
+        // Nothing may be fed to the lexer after EOF was handled
+        assert(0);
+    }
+    return CONTINUE;
+}
+
+/** Feed characters from `stream` to the lexer until it stops.
+ * Every character read is appended to the input buffer (EOF is stored as a
+ * 0 byte) before it is handed to lex_handle_next().
+ * \returns EOF if end of file reached.
+ * \returns ERR if error encountered and lexing cannot continue.
+ * \returns OK if lex_handle_next() reports OK.
+ * NOTE(review): lex_handle_next() as written returns only CONTINUE or ERR, so
+ * the OK path looks unreachable for now -- confirm.
+ */
+static int lex_next(struct lex *const self, FILE *const stream)
+{
+ for (;; self->cursor++) {
+ const int c = fgetc(stream);
+ const char c_char = (c == EOF) ? 0 : c;
+ fwrite(&c_char, sizeof c_char, 1, self->input_stream);
+ const int ret = lex_handle_next(self, c);
+ if (OK == ret) {
+ return OK;
+ } else if (ERR == ret) {
+ // TODO handle errors
+ return ERR;
+ }
+ if (c == EOF) {
+ break;
+ }
+ }
+ return EOF;
+}
+
+/** Drive lex_next() over `stream` until it reports end of input, then flush
+ * the input and token memory streams so their buffers are up to date.
+ * \returns OK if lexing finished successfully
+ * \returns ERR if error encountered and lexing cannot continue.
+ */
+static int lex_run(struct lex *const self, FILE *const stream)
+{
+    for (;;) {
+        const int status = lex_next(self, stream);
+        if (status == ERR) {
+            return ERR;
+        }
+        if (status == EOF) {
+            break;
+        }
+    }
+    fflush(self->input_stream);
+    fflush(self->tokbuf_stream);
return OK;
}
+/** Close both lexer memory streams and free the buffers behind them.
+ * Each buffer is freed only after its stream has been closed.
+ */
static void lex_destroy(struct lex *const self)
{
- (void) self;
+ fclose(self->input_stream);
+ free(self->input);
+ fclose(self->tokbuf_stream);
+ free(self->tokbuf);
}
-static int pars_init(struct pars *const self, struct lex *const lex)
+/** Reset the parser, attach it to `lex` and open its two memory streams.
+ * \returns OK on success.
+ * \returns ERR if a memory stream could not be opened; in that case nothing
+ * is left open or allocated and pars_destroy() must not be called.
+ */
+static int pars_init(struct pars *const self, const struct lex *const lex)
{
- (void) self;
- (void) lex;
+    *self = (struct pars){
+        .lex = lex,
+        .stmttab_stream = open_memstream(
+                (char **)&self->stmttab, &self->stmttab_size),
+        .symbuf_stream = open_memstream(&self->symbuf, &self->symbuf_size),
+    };
+    if (self->stmttab_stream == NULL || self->symbuf_stream == NULL) {
+        // Report the failure instead of asserting: assert() disappears under
+        // NDEBUG. Release whichever stream did open before bailing out.
+        if (self->stmttab_stream != NULL) {
+            fclose(self->stmttab_stream);
+            free(self->stmttab);
+        }
+        if (self->symbuf_stream != NULL) {
+            fclose(self->symbuf_stream);
+            free(self->symbuf);
+        }
+        *self = (struct pars){0};
+        return ERR;
+    }
return OK;
}
-static int pars_run(struct pars *const self, FILE *const stream)
+/** Run parser until the end of the input reached
+ * \returns OK if parsing finished successfully
+ * \returns ERR if error encountered and parsing cannot continue.
+ * Currently a stub: it ignores `self` and always returns OK.
+ */
+static int pars_run(struct pars *const self)
{
(void) self;
- (void) stream;
+ // TODO
return OK;
}
+/** Close both parser memory streams and free the buffers behind them.
+ * Each buffer is freed only after its stream has been closed.
+ */
static void pars_destroy(struct pars *const self)
{
- (void) self;
+ fclose(self->stmttab_stream);
+ free(self->stmttab);
+ fclose(self->symbuf_stream);
+ free(self->symbuf);
}
-static int assem_init(struct assem *const self, struct pars *const pars)
+/** Reset the assembler state and attach it to the parser it reads from.
+ * \returns OK, always.
+ */
+static int assem_init(struct assem *const self, const struct pars *const pars)
{
- (void) self;
- (void) pars;
+    const struct assem initial = { .pars = pars };
+    *self = initial;
return OK;
}
@@ -220,8 +782,12 @@ static int assem_resolve(struct assem *const self)
+/** Emit the assembler output to `stream`.
+ * For now this only prints the lexer's token trace when TRACE_LEX is non-zero.
+ * \returns OK, always.
+ */
static int assem_emit(struct assem *const self, FILE *const stream)
{
- (void) self;
- (void) stream;
+    if (TRACE_LEX) {
+        const struct lex *const lex = self->pars->lex;
+        const size_t tokens_count = lex->tokbuf_size / (sizeof *lex->tokbuf);
+        // Tokens are stored from index 0 on, so start there; starting at 1
+        // would leave the first token of the input out of the trace.
+        for (size_t i = 0; i < tokens_count; i++) {
+            fprint_tok(lex->input, &lex->tokbuf[i], stream);
+        }
+    }
return OK;
}
@@ -240,19 +806,26 @@ int main(const int argc, char *const argv[])
if (OK != lex_init(&lex)) {
return EXIT_FAILURE;
}
- // Chain lexer and parser
+ // Tokenize assembly program text
+ if (OK != lex_run(&lex, stdin)) {
+ lex_destroy(&lex);
+ return EXIT_FAILURE;
+ }
+ // Parser needs final lexer state to access parsed tokens and input data
if (OK != pars_init(&pars, &lex)) {
lex_destroy(&lex);
return EXIT_FAILURE;
}
- // Parse assembly program text (tokenize + parse)
- if (OK != pars_run(&pars, stdin)) {
+ // Parse assembly program text
+ if (OK != pars_run(&pars)) {
pars_destroy(&pars);
lex_destroy(&lex);
return EXIT_FAILURE;
}
struct assem assem;
- // Allocate adn populate code table and metadata table from parsed data
+ // Allocate and populate code table and metadata table from parsed data.
+ // Assembler needs parser's and lexer's final state to access parsed
+ // structure, tokens and input.
if (OK != assem_init(&assem, &pars)) {
pars_destroy(&pars);
lex_destroy(&lex);