diff options
author | Oxore <oxore@protonmail.com> | 2023-06-24 01:43:18 +0300 |
---|---|---|
committer | Oxore <oxore@protonmail.com> | 2023-06-24 01:53:43 +0300 |
commit | e0029661122547edf6261bd6a56ed7fc090170ff (patch) | |
tree | f592ef9a1d191d0adfb867d70bf34bb43ee07018 | |
parent | 7f37b944865967e3f41be925fcfee73adc6c7671 (diff) |
Begin implementing parser, fix lexer
-rw-r--r-- | main.c | 196 |
1 files changed, 145 insertions, 51 deletions
@@ -8,9 +8,10 @@ #include <assert.h> #include <errno.h> #include <stdbool.h> +#include <stdint.h> #include <stdio.h> #include <stdlib.h> -#include <stdint.h> +#include <string.h> #ifndef TRACE_LEX #define TRACE_LEX 1 @@ -23,17 +24,18 @@ enum token_type { TT_NONE = 0, TT_SPACE, - TT_TAB, TT_NEWLINE, + TT_ESCAPE, TT_DOT, TT_COMMA, TT_PLUS, TT_MINUS, + TT_ASTERISK, + TT_SLASH, TT_EQ, TT_COLON, TT_PERCENT, TT_HASH, - TT_ASTERISK, TT_STRING, TT_IDENTIFIER, TT_NUMDEC, @@ -60,7 +62,6 @@ enum lex_state { LS_FREE = 0, LS_CR, LS_SPACE, - LS_TAB, LS_IDENTIFIER, LS_NUMOCTHEX, LS_NUMOCT, @@ -86,7 +87,6 @@ struct lex { enum lex_error error; size_t cursor; size_t tok_offset; - size_t tokens_count; bool inside_line; // Input data buffer FILE *input_stream; @@ -111,7 +111,7 @@ enum stmt_kind { SK_DIR_LINE, }; -enum opcode { +enum mnemonic { OPCODE_NONE, OPCODE_NOP, }; @@ -148,7 +148,7 @@ struct arg_8_xn { }; struct instruction { - enum opcode opcode; + enum mnemonic mnemonic; enum opsize opsize; enum arg_kind arg1_kind, arg2_kind; union { @@ -303,18 +303,19 @@ static const char *tok_kind_to_string(const enum token_type type) switch (type) { case TT_NONE: return "NONE"; case TT_SPACE: return "SPACE"; - case TT_TAB: return "TAB"; case TT_NEWLINE: return "NEWLINE"; + case TT_ESCAPE: return "ESCAPE"; case TT_DOT: return "DOT"; case TT_COMMA: return "COMMA"; case TT_PLUS: return "PLUS"; case TT_MINUS: return "MINUS"; + case TT_ASTERISK: return "ASTERISK"; + case TT_SLASH: return "SLASH"; case TT_EQ: return "EQ"; case TT_COLON: return "COLON"; case TT_PERCENT: return "PERCENT"; case TT_HASH: return "HASH"; case TT_STRING: return "STRING"; - case TT_ASTERISK: return "ASTERISK"; case TT_IDENTIFIER: return "IDENTIFIER"; case TT_NUMDEC: return "NUMDEC"; case TT_NUMOCT: return "NUMOCT"; @@ -369,9 +370,8 @@ static int fwrite_token(const struct token *const token, FILE *const stream) static void lex_yield_token(struct lex *const self, const struct token *const token) { - self->inside_line = token->type != TT_NEWLINE; + self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE); fwrite_token(token, self->tokbuf_stream); - self->tokens_count++; } static const char *lex_state_error_string( @@ -406,7 +406,6 @@ static const char *lex_state_error_string( "or EOF"; case LS_CR: case LS_SPACE: - case LS_TAB: case LS_IDENTIFIER: case LS_STRING: case LS_STRING_ESC: @@ -433,11 +432,11 @@ static struct line_pos_info lex_get_line_pos_info(const struct lex *const self) l.line_num++; l.column_num = 0; } else if (c == '\n') { - cr = false; - l.line_offset = i + 1; if (!cr) { l.line_num++; } + cr = false; + l.line_offset = i + 1; l.column_num = 0; } else { cr = false; @@ -451,6 +450,7 @@ static int lex_yield_error(struct lex *const self, const int c) { fflush(self->input_stream); const struct line_pos_info l = lex_get_line_pos_info(self); + const size_t cursor = self->cursor; { // Read out the rest of the line int c; @@ -461,13 +461,15 @@ static int lex_yield_error(struct lex *const self, const int c) } while (c != EOF && c != '\n' && c != '\r'); fflush(self->input_stream); } + const unsigned char c_char = (c == EOF) ? 0 : c; fprintf( stderr, - "<stdin>:%lu:%lu: lexing error: expected %s, found '%c'\n", + "<stdin>:%lu:%lu: lexing error: expected %s, found '", l.line_num + 1, l.column_num + 1, - lex_state_error_string(self->state, self->inside_line), - c); + lex_state_error_string(self->state, self->inside_line)); + fputs(g_escape_table[c_char], stderr); + fputs("'\n", stderr); fprintf(stderr, "%5lu | %s\n", l.line_num + 1, self->input + l.line_offset); fputs(" | ", stderr); for (size_t i = 0; i < l.column_num; i++) { @@ -478,6 +480,7 @@ static int lex_yield_error(struct lex *const self, const int c) } } fputs("^\n", stderr); + fprintf(stderr, "<stdin>: %lu bytes parsed\n", cursor); self->state = LS_ERROR; return ERR; } @@ -487,7 +490,6 @@ static int lex_handle_next(struct lex *const self, const int c) switch (self->state) { case LS_FREE: if (is_alphabetic(c) || c == '_') { - self->inside_line = false; self->tok_offset = self->cursor; self->state = LS_IDENTIFIER; } else if (c == '0') { @@ -502,12 +504,6 @@ static int lex_handle_next(struct lex *const self, const int c) } else if (c == ';') { self->tok_offset = self->cursor; self->state = LS_COMMENT_SEMICOLON; - } else if (c == '*') { - if (self->inside_line) { - return lex_yield_error(self, c); - } - self->tok_offset = self->cursor; - self->state = LS_COMMENT_ASTERISK; } else if (c == ',') { lex_yield_token(self, &(struct token){TT_COMMA, self->cursor, 1}); } else if (c == '.') { @@ -520,6 +516,15 @@ static int lex_handle_next(struct lex *const self, const int c) lex_yield_token(self, &(struct token){TT_PLUS, self->cursor, 1}); } else if (c == '-') { lex_yield_token(self, &(struct token){TT_MINUS, self->cursor, 1}); + } else if (c == '*') { + if (self->inside_line) { + lex_yield_token( + self, &(struct token){TT_ASTERISK, self->cursor, 1}); + } + self->tok_offset = self->cursor; + self->state = LS_COMMENT_ASTERISK; + } else if (c == '/') { + lex_yield_token(self, &(struct token){TT_SLASH, self->cursor, 1}); } else if (c == '=') { lex_yield_token(self, &(struct token){TT_EQ, self->cursor, 1}); } else if (c == ':') { @@ -533,14 +538,15 @@ static int lex_handle_next(struct lex *const self, const int c) self->state = LS_CR; } else if (c == '\n') { lex_yield_token(self, &(struct token){TT_NEWLINE, self->cursor, 1}); - } else if (c == ' ') { + } else if (c == '\\') { + lex_yield_token(self, &(struct token){TT_ESCAPE, self->cursor, 1}); + } else if (c == ' ' || c == '\t') { self->tok_offset = self->cursor; self->state = LS_SPACE; - } else if (c == '\t') { - self->tok_offset = self->cursor; - self->state = LS_TAB; } else if (c == EOF) { self->state = LS_EOF; + } else if (c == '\x1a') { + // Ignore "End of file" character } else { return lex_yield_error(self, c); } @@ -558,26 +564,20 @@ static int lex_handle_next(struct lex *const self, const int c) } break; case LS_SPACE: // Accumulate multiple spaces into single token - if (c != ' ') { - const size_t length = self->cursor - self->tok_offset; - const struct token token = {TT_SPACE, self->tok_offset, length}; - lex_yield_token(self, &token); - self->state = LS_FREE; - return lex_handle_next(self, c); - } - break; - case LS_TAB: // Accumulate multiple tabs into single token - if (c != '\t') { - const size_t length = self->cursor - self->tok_offset; - const struct token token = {TT_TAB, self->tok_offset, length}; - lex_yield_token(self, &token); + if (c != ' ' && c != '\t') { + // Only spaces and tabs at the beginning of the line are significant + if (!self->inside_line) { + const size_t length = self->cursor - self->tok_offset; + const struct token token = {TT_SPACE, self->tok_offset, length}; + lex_yield_token(self, &token); + } self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_IDENTIFIER: if (!is_alphanum(c) && c != '_') { - const size_t length = self->cursor - self->tok_offset; + const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_IDENTIFIER, self->tok_offset, length}; lex_yield_token(self, &token); self->state = LS_FREE; @@ -604,7 +604,7 @@ static int lex_handle_next(struct lex *const self, const int c) if (is_alphabetic(c) || c == '_') { return lex_yield_error(self, c); } else if (!is_oct(c)) { - const size_t length = self->cursor - self->tok_offset; + const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_NUMOCT, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state @@ -619,7 +619,7 @@ static int lex_handle_next(struct lex *const self, const int c) // Panik! return lex_yield_error(self, c); } else { - const size_t length = self->cursor - self->tok_offset; + const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_NUMHEX, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state @@ -631,7 +631,7 @@ static int lex_handle_next(struct lex *const self, const int c) if (is_alphabetic(c) || c == '_') { return lex_yield_error(self, c); } else if (!is_dec(c)) { - const size_t length = self->cursor - self->tok_offset; + const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_NUMDEC, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state @@ -643,7 +643,7 @@ static int lex_handle_next(struct lex *const self, const int c) if (c == '\\') { self->state = LS_STRING_ESC; } else if (c == '"') { - const size_t length = self->cursor - self->tok_offset + 1; + const size_t length = self->cursor - self->tok_offset + 1; const struct token token = {TT_STRING, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished @@ -655,7 +655,7 @@ static int lex_handle_next(struct lex *const self, const int c) break; case LS_COMMENT_ASTERISK: if (c == '\r' || c == '\n') { - const size_t length = self->cursor - self->tok_offset; + const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_COMMENT_ASTERISK, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state @@ -665,7 +665,7 @@ static int lex_handle_next(struct lex *const self, const int c) break; case LS_COMMENT_SEMICOLON: if (c == '\r' || c == '\n') { - const size_t length = self->cursor - self->tok_offset; + const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_COMMENT_SEMICOLON, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state @@ -747,14 +747,108 @@ static int pars_init(struct pars *const self, const struct lex *const lex) return OK; } +static int pars_yield_error(struct pars *const self) +{ + (void) self; + // TODO + return ERR; +} + +static int pars_parse_label(struct pars *const self) +{ + const struct token label = self->lex->tokbuf[self->cur_tok_id++]; + if (label.type != TT_IDENTIFIER) { + return pars_yield_error(self); + } + // TODO + return OK; +} + +static int pars_parse_instr(struct pars *const self, const struct token mnemo) +{ + (void) self; + (void) mnemo; + // TODO + return OK; +} + +static int pars_parse_direc(struct pars *const self) +{ + const char *input = self->lex->input; + const struct token direc = self->lex->tokbuf[self->cur_tok_id++]; + if (direc.type != TT_IDENTIFIER) { + return pars_yield_error(self); + } + if (0 == strcmp(input + direc.offset, "def")) { + } else if (0 == strcmp(input + direc.offset, "opt")) { + } else if (0 == strcmp(input + direc.offset, "file")) { + } else if (0 == strcmp(input + direc.offset, "text")) { + } else if (0 == strcmp(input + direc.offset, "align")) { + } else if (0 == strcmp(input + direc.offset, "globl")) { + } else if (0 == strcmp(input + direc.offset, "ln")) { + } else if (0 == strcmp(input + direc.offset, "long")) { + } else if (0 == strcmp(input + direc.offset, "word")) { + } else if (0 == strcmp(input + direc.offset, "byte")) { + } else if (0 == strcmp(input + direc.offset, "bin")) { + } + // TODO + return OK; +} + +static int pars_parse_instr_or_direc(struct pars *const self) +{ + struct token token; + assert(token.type != TT_SPACE); + if (token.type == TT_NEWLINE) { + return OK; + } else if (token.type == TT_DOT) { + return pars_parse_direc(self); + } else if (token.type == TT_IDENTIFIER) { + return pars_parse_instr(self, token); + } else { + return pars_yield_error(self); + } + return OK; +} + +static int pars_parse_statement(struct pars *const self) +{ + const size_t tokens_count = self->lex->tokbuf_size / + (sizeof *self->lex->tokbuf); + const struct token token = self->lex->tokbuf[self->cur_tok_id++]; + const bool is_comment = token.type == TT_COMMENT_ASTERISK || + token.type == TT_COMMENT_SEMICOLON; + if (token.type == TT_SPACE) { + return pars_parse_instr_or_direc(self); + } else if (token.type == TT_IDENTIFIER) { + return pars_parse_label(self); + } else if (!is_comment) { + if (self->cur_tok_id < tokens_count) { + const struct token nl = self->lex->tokbuf[self->cur_tok_id++]; + assert(nl.type == TT_NEWLINE); + } + return OK; + } else if (token.type == TT_NEWLINE) { + return OK; + } + return pars_yield_error(self); +} + /** Run parser until the end of the input reached * \returns OK if parsing finished successfully * \returns ERR if error encountered and parsing cannot continue. */ static int pars_run(struct pars *const self) { - (void) self; - // TODO + return OK; + const size_t tokens_count = self->lex->tokbuf_size / + (sizeof *self->lex->tokbuf); + do { + const int ret = pars_parse_statement(self); + if (ret != OK) { + return ret; + } + } while (self->cur_tok_id < tokens_count); return OK; } @@ -784,7 +878,7 @@ static int assem_emit(struct assem *const self, FILE *const stream) { if (TRACE_LEX) { const struct lex *const lex = self->pars->lex; - for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) { + for (size_t i = 0; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) { fprint_tok(lex->input, &lex->tokbuf[i], stream); } } |