summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOxore <oxore@protonmail.com>2023-06-24 01:43:18 +0300
committerOxore <oxore@protonmail.com>2023-06-24 01:53:43 +0300
commite0029661122547edf6261bd6a56ed7fc090170ff (patch)
treef592ef9a1d191d0adfb867d70bf34bb43ee07018
parent7f37b944865967e3f41be925fcfee73adc6c7671 (diff)
Begin implementing parser, fix lexer
-rw-r--r--main.c196
1 files changed, 145 insertions, 51 deletions
diff --git a/main.c b/main.c
index b8b6fff..2b3ab2f 100644
--- a/main.c
+++ b/main.c
@@ -8,9 +8,10 @@
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
+#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
-#include <stdint.h>
+#include <string.h>
#ifndef TRACE_LEX
#define TRACE_LEX 1
@@ -23,17 +24,18 @@
enum token_type {
TT_NONE = 0,
TT_SPACE,
- TT_TAB,
TT_NEWLINE,
+ TT_ESCAPE,
TT_DOT,
TT_COMMA,
TT_PLUS,
TT_MINUS,
+ TT_ASTERISK,
+ TT_SLASH,
TT_EQ,
TT_COLON,
TT_PERCENT,
TT_HASH,
- TT_ASTERISK,
TT_STRING,
TT_IDENTIFIER,
TT_NUMDEC,
@@ -60,7 +62,6 @@ enum lex_state {
LS_FREE = 0,
LS_CR,
LS_SPACE,
- LS_TAB,
LS_IDENTIFIER,
LS_NUMOCTHEX,
LS_NUMOCT,
@@ -86,7 +87,6 @@ struct lex {
enum lex_error error;
size_t cursor;
size_t tok_offset;
- size_t tokens_count;
bool inside_line;
// Input data buffer
FILE *input_stream;
@@ -111,7 +111,7 @@ enum stmt_kind {
SK_DIR_LINE,
};
-enum opcode {
+enum mnemonic {
OPCODE_NONE,
OPCODE_NOP,
};
@@ -148,7 +148,7 @@ struct arg_8_xn {
};
struct instruction {
- enum opcode opcode;
+ enum mnemonic mnemonic;
enum opsize opsize;
enum arg_kind arg1_kind, arg2_kind;
union {
@@ -303,18 +303,19 @@ static const char *tok_kind_to_string(const enum token_type type)
switch (type) {
case TT_NONE: return "NONE";
case TT_SPACE: return "SPACE";
- case TT_TAB: return "TAB";
case TT_NEWLINE: return "NEWLINE";
+ case TT_ESCAPE: return "ESCAPE";
case TT_DOT: return "DOT";
case TT_COMMA: return "COMMA";
case TT_PLUS: return "PLUS";
case TT_MINUS: return "MINUS";
+ case TT_ASTERISK: return "ASTERISK";
+ case TT_SLASH: return "SLASH";
case TT_EQ: return "EQ";
case TT_COLON: return "COLON";
case TT_PERCENT: return "PERCENT";
case TT_HASH: return "HASH";
case TT_STRING: return "STRING";
- case TT_ASTERISK: return "ASTERISK";
case TT_IDENTIFIER: return "IDENTIFIER";
case TT_NUMDEC: return "NUMDEC";
case TT_NUMOCT: return "NUMOCT";
@@ -369,9 +370,8 @@ static int fwrite_token(const struct token *const token, FILE *const stream)
static void lex_yield_token(struct lex *const self, const struct token *const token)
{
- self->inside_line = token->type != TT_NEWLINE;
+ self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE);
fwrite_token(token, self->tokbuf_stream);
- self->tokens_count++;
}
static const char *lex_state_error_string(
@@ -406,7 +406,6 @@ static const char *lex_state_error_string(
"or EOF";
case LS_CR:
case LS_SPACE:
- case LS_TAB:
case LS_IDENTIFIER:
case LS_STRING:
case LS_STRING_ESC:
@@ -433,11 +432,11 @@ static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
l.line_num++;
l.column_num = 0;
} else if (c == '\n') {
- cr = false;
- l.line_offset = i + 1;
if (!cr) {
l.line_num++;
}
+ cr = false;
+ l.line_offset = i + 1;
l.column_num = 0;
} else {
cr = false;
@@ -451,6 +450,7 @@ static int lex_yield_error(struct lex *const self, const int c)
{
fflush(self->input_stream);
const struct line_pos_info l = lex_get_line_pos_info(self);
+ const size_t cursor = self->cursor;
{
// Read out the rest of the line
int c;
@@ -461,13 +461,15 @@ static int lex_yield_error(struct lex *const self, const int c)
} while (c != EOF && c != '\n' && c != '\r');
fflush(self->input_stream);
}
+ const unsigned char c_char = (c == EOF) ? 0 : c;
fprintf(
stderr,
- "<stdin>:%lu:%lu: lexing error: expected %s, found '%c'\n",
+ "<stdin>:%lu:%lu: lexing error: expected %s, found '",
l.line_num + 1,
l.column_num + 1,
- lex_state_error_string(self->state, self->inside_line),
- c);
+ lex_state_error_string(self->state, self->inside_line));
+ fputs(g_escape_table[c_char], stderr);
+ fputs("'\n", stderr);
fprintf(stderr, "%5lu | %s\n", l.line_num + 1, self->input + l.line_offset);
fputs(" | ", stderr);
for (size_t i = 0; i < l.column_num; i++) {
@@ -478,6 +480,7 @@ static int lex_yield_error(struct lex *const self, const int c)
}
}
fputs("^\n", stderr);
+ fprintf(stderr, "<stdin>: %lu bytes parsed\n", cursor);
self->state = LS_ERROR;
return ERR;
}
@@ -487,7 +490,6 @@ static int lex_handle_next(struct lex *const self, const int c)
switch (self->state) {
case LS_FREE:
if (is_alphabetic(c) || c == '_') {
- self->inside_line = false;
self->tok_offset = self->cursor;
self->state = LS_IDENTIFIER;
} else if (c == '0') {
@@ -502,12 +504,6 @@ static int lex_handle_next(struct lex *const self, const int c)
} else if (c == ';') {
self->tok_offset = self->cursor;
self->state = LS_COMMENT_SEMICOLON;
- } else if (c == '*') {
- if (self->inside_line) {
- return lex_yield_error(self, c);
- }
- self->tok_offset = self->cursor;
- self->state = LS_COMMENT_ASTERISK;
} else if (c == ',') {
lex_yield_token(self, &(struct token){TT_COMMA, self->cursor, 1});
} else if (c == '.') {
@@ -520,6 +516,15 @@ static int lex_handle_next(struct lex *const self, const int c)
lex_yield_token(self, &(struct token){TT_PLUS, self->cursor, 1});
} else if (c == '-') {
lex_yield_token(self, &(struct token){TT_MINUS, self->cursor, 1});
+ } else if (c == '*') {
+ if (self->inside_line) {
+ lex_yield_token(
+ self, &(struct token){TT_ASTERISK, self->cursor, 1});
+ }
+ self->tok_offset = self->cursor;
+ self->state = LS_COMMENT_ASTERISK;
+ } else if (c == '/') {
+ lex_yield_token(self, &(struct token){TT_SLASH, self->cursor, 1});
} else if (c == '=') {
lex_yield_token(self, &(struct token){TT_EQ, self->cursor, 1});
} else if (c == ':') {
@@ -533,14 +538,15 @@ static int lex_handle_next(struct lex *const self, const int c)
self->state = LS_CR;
} else if (c == '\n') {
lex_yield_token(self, &(struct token){TT_NEWLINE, self->cursor, 1});
- } else if (c == ' ') {
+ } else if (c == '\\') {
+ lex_yield_token(self, &(struct token){TT_ESCAPE, self->cursor, 1});
+ } else if (c == ' ' || c == '\t') {
self->tok_offset = self->cursor;
self->state = LS_SPACE;
- } else if (c == '\t') {
- self->tok_offset = self->cursor;
- self->state = LS_TAB;
} else if (c == EOF) {
self->state = LS_EOF;
+ } else if (c == '\x1a') {
+ // Ignore "End of file" character
} else {
return lex_yield_error(self, c);
}
@@ -558,26 +564,20 @@ static int lex_handle_next(struct lex *const self, const int c)
}
break;
case LS_SPACE: // Accumulate multiple spaces into single token
- if (c != ' ') {
- const size_t length = self->cursor - self->tok_offset;
- const struct token token = {TT_SPACE, self->tok_offset, length};
- lex_yield_token(self, &token);
- self->state = LS_FREE;
- return lex_handle_next(self, c);
- }
- break;
- case LS_TAB: // Accumulate multiple tabs into single token
- if (c != '\t') {
- const size_t length = self->cursor - self->tok_offset;
- const struct token token = {TT_TAB, self->tok_offset, length};
- lex_yield_token(self, &token);
+ if (c != ' ' && c != '\t') {
+ // Only spaces and tabs at the beginning of the line are significant
+ if (!self->inside_line) {
+ const size_t length = self->cursor - self->tok_offset;
+ const struct token token = {TT_SPACE, self->tok_offset, length};
+ lex_yield_token(self, &token);
+ }
self->state = LS_FREE;
return lex_handle_next(self, c);
}
break;
case LS_IDENTIFIER:
if (!is_alphanum(c) && c != '_') {
- const size_t length = self->cursor - self->tok_offset;
+ const size_t length = self->cursor - self->tok_offset;
const struct token token = {TT_IDENTIFIER, self->tok_offset, length};
lex_yield_token(self, &token);
self->state = LS_FREE;
@@ -604,7 +604,7 @@ static int lex_handle_next(struct lex *const self, const int c)
if (is_alphabetic(c) || c == '_') {
return lex_yield_error(self, c);
} else if (!is_oct(c)) {
- const size_t length = self->cursor - self->tok_offset;
+ const size_t length = self->cursor - self->tok_offset;
const struct token token = {TT_NUMOCT, self->tok_offset, length};
lex_yield_token(self, &token);
// This token is finished, handle this char in LS_FREE state
@@ -619,7 +619,7 @@ static int lex_handle_next(struct lex *const self, const int c)
// Panik!
return lex_yield_error(self, c);
} else {
- const size_t length = self->cursor - self->tok_offset;
+ const size_t length = self->cursor - self->tok_offset;
const struct token token = {TT_NUMHEX, self->tok_offset, length};
lex_yield_token(self, &token);
// This token is finished, handle this char in LS_FREE state
@@ -631,7 +631,7 @@ static int lex_handle_next(struct lex *const self, const int c)
if (is_alphabetic(c) || c == '_') {
return lex_yield_error(self, c);
} else if (!is_dec(c)) {
- const size_t length = self->cursor - self->tok_offset;
+ const size_t length = self->cursor - self->tok_offset;
const struct token token = {TT_NUMDEC, self->tok_offset, length};
lex_yield_token(self, &token);
// This token is finished, handle this char in LS_FREE state
@@ -643,7 +643,7 @@ static int lex_handle_next(struct lex *const self, const int c)
if (c == '\\') {
self->state = LS_STRING_ESC;
} else if (c == '"') {
- const size_t length = self->cursor - self->tok_offset + 1;
+ const size_t length = self->cursor - self->tok_offset + 1;
const struct token token = {TT_STRING, self->tok_offset, length};
lex_yield_token(self, &token);
// This token is finished
@@ -655,7 +655,7 @@ static int lex_handle_next(struct lex *const self, const int c)
break;
case LS_COMMENT_ASTERISK:
if (c == '\r' || c == '\n') {
- const size_t length = self->cursor - self->tok_offset;
+ const size_t length = self->cursor - self->tok_offset;
const struct token token = {TT_COMMENT_ASTERISK, self->tok_offset, length};
lex_yield_token(self, &token);
// This token is finished, handle this char in LS_FREE state
@@ -665,7 +665,7 @@ static int lex_handle_next(struct lex *const self, const int c)
break;
case LS_COMMENT_SEMICOLON:
if (c == '\r' || c == '\n') {
- const size_t length = self->cursor - self->tok_offset;
+ const size_t length = self->cursor - self->tok_offset;
const struct token token = {TT_COMMENT_SEMICOLON, self->tok_offset, length};
lex_yield_token(self, &token);
// This token is finished, handle this char in LS_FREE state
@@ -747,14 +747,108 @@ static int pars_init(struct pars *const self, const struct lex *const lex)
return OK;
}
+static int pars_yield_error(struct pars *const self)
+{ // Report a parse error; currently a stub that only returns ERR
+ (void) self;
+ // TODO: not implemented yet, nothing is printed or recorded here
+ return ERR;
+}
+
+static int pars_parse_label(struct pars *const self)
+{ // Parse a label; returns OK, or the result of pars_yield_error()
+ const struct token label = self->lex->tokbuf[self->cur_tok_id++]; // consumes one token
+ if (label.type != TT_IDENTIFIER) { // NOTE(review): the caller already consumed an IDENTIFIER before dispatching here, so this is the token after it - confirm intended
+ return pars_yield_error(self);
+ }
+ // TODO: nothing is done with the label token yet
+ return OK;
+}
+
+static int pars_parse_instr(struct pars *const self, const struct token mnemo)
+{ // Parse an instruction whose mnemonic token is `mnemo`; stub, always OK
+ (void) self;
+ (void) mnemo;
+ // TODO: not implemented yet, both arguments are ignored
+ return OK;
+}
+
+static int pars_parse_direc(struct pars *const self)
+{ // Parse a directive name; returns OK, or the result of pars_yield_error()
+ const char *input = self->lex->input;
+ const struct token direc = self->lex->tokbuf[self->cur_tok_id++]; // consumes one token
+ if (direc.type != TT_IDENTIFIER) {
+ return pars_yield_error(self);
+ }
+ // NOTE(review): strcmp() reads up to the NUL terminator of `input`, not
+ // just the token's bytes, so a length-bounded comparison is presumably
+ // needed for these to match a directive in mid-input - confirm.
+ // Every branch of the chain is still empty.
+ if (0 == strcmp(input + direc.offset, "def")) {
+ } else if (0 == strcmp(input + direc.offset, "opt")) {
+ } else if (0 == strcmp(input + direc.offset, "file")) {
+ } else if (0 == strcmp(input + direc.offset, "text")) {
+ } else if (0 == strcmp(input + direc.offset, "align")) {
+ } else if (0 == strcmp(input + direc.offset, "globl")) {
+ } else if (0 == strcmp(input + direc.offset, "ln")) {
+ } else if (0 == strcmp(input + direc.offset, "long")) {
+ } else if (0 == strcmp(input + direc.offset, "word")) {
+ } else if (0 == strcmp(input + direc.offset, "byte")) {
+ } else if (0 == strcmp(input + direc.offset, "bin")) {
+ }
+ // TODO: handle the recognized directive
+ return OK;
+}
+
+static int pars_parse_instr_or_direc(struct pars *const self)
+{ // Parse what follows a line's leading whitespace: directive, instruction or nothing
+ const struct token token = self->lex->tokbuf[self->cur_tok_id++]; // fetch and consume the next token (was read uninitialized)
+ assert(token.type != TT_SPACE); // the leading TT_SPACE was consumed by the caller
+ if (token.type == TT_NEWLINE) { // whitespace-only line
+ return OK;
+ } else if (token.type == TT_DOT) { // '.' introduces a directive name
+ return pars_parse_direc(self);
+ } else if (token.type == TT_IDENTIFIER) { // identifier is passed on as the mnemonic
+ return pars_parse_instr(self, token);
+ } else {
+ return pars_yield_error(self);
+ }
+ return OK; // not reached: every branch above returns
+}
+
+static int pars_parse_statement(struct pars *const self)
+{ // Parse one statement starting at the current token; returns OK or the result of pars_yield_error()
+ const size_t tokens_count = self->lex->tokbuf_size /
+ (sizeof *self->lex->tokbuf);
+ const struct token token = self->lex->tokbuf[self->cur_tok_id++]; // consumes one token
+ const bool is_comment = token.type == TT_COMMENT_ASTERISK ||
+ token.type == TT_COMMENT_SEMICOLON;
+ if (token.type == TT_SPACE) { // line starts with whitespace
+ return pars_parse_instr_or_direc(self);
+ } else if (token.type == TT_IDENTIFIER) { // line starts with an identifier
+ return pars_parse_label(self);
+ } else if (is_comment) { // was `!is_comment`, which rejected comments and made the NEWLINE branch unreachable
+ if (self->cur_tok_id < tokens_count) { // a comment may be the very last token
+ const struct token nl = self->lex->tokbuf[self->cur_tok_id++]; // consume the token that ends the comment line
+ assert(nl.type == TT_NEWLINE);
+ }
+ return OK;
+ } else if (token.type == TT_NEWLINE) { // empty line
+ return OK;
+ }
+ return pars_yield_error(self);
+}
+
/** Run parser until the end of the input reached
* \returns OK if parsing finished successfully
* \returns ERR if error encountered and parsing cannot continue.
*/
static int pars_run(struct pars *const self)
{
- (void) self;
- // TODO
+ return OK; // NOTE(review): this early return leaves the loop below unreachable - presumably the parser is disabled on purpose while unfinished; confirm
+ const size_t tokens_count = self->lex->tokbuf_size /
+ (sizeof *self->lex->tokbuf);
+ do { // parse statements until one fails or every token is consumed
+ const int ret = pars_parse_statement(self);
+ if (ret != OK) {
+ return ret;
+ }
+ } while (self->cur_tok_id < tokens_count);
return OK;
}
@@ -784,7 +878,7 @@ static int assem_emit(struct assem *const self, FILE *const stream)
{
if (TRACE_LEX) {
const struct lex *const lex = self->pars->lex;
- for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {
+ for (size_t i = 0; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {
fprint_tok(lex->input, &lex->tokbuf[i], stream);
}
}