diff options
author | Oxore <oxore@protonmail.com> | 2023-06-25 12:51:39 +0300 |
---|---|---|
committer | Oxore <oxore@protonmail.com> | 2023-06-25 12:52:39 +0300 |
commit | f4666450e21bf4558ace3c93eb474f062a0fda4b (patch) | |
tree | d41122c44b77e0104b23b69dfa3b77fe3f3f7dcc | |
parent | 19812eab123d347435929dd4f40649b467d3b457 (diff) |
Impl basics of instruction parsing
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | main.c | 1024 |
2 files changed, 918 insertions, 108 deletions
@@ -2,7 +2,7 @@ WARNFLAGS = -Wall -Wextra -pedantic -Wlogical-op INCLUDES = lib -_FLAGS = -O2 -fsanitize=address +_FLAGS = -O2 -fsanitize=unreachable -fsanitize=address _CFLAGS = $(CFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g _CXXFLAGS = $(CXXFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g LDSCRIPTS = @@ -6,6 +6,7 @@ */ #include <assert.h> +#include <ctype.h> #include <errno.h> #include <stdbool.h> #include <stdint.h> @@ -13,8 +14,18 @@ #include <stdlib.h> #include <string.h> -#ifndef TRACE_LEX -#define TRACE_LEX 1 +#ifndef TRACE_LEXER +#define TRACE_LEXER 0 +#endif + +#ifndef TRACE_PARSER +#define TRACE_PARSER 1 +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define UNREACHABLE __builtin_unreachable +#else +#define UNREACHABLE() #endif #define ERR 0 @@ -39,7 +50,6 @@ enum token_type { TT_RIGHT_SHIFT, TT_HASH, TT_BANG, - TT_DOLLAR, TT_TILDE, TT_AMPERSAND, TT_PIPE, @@ -112,22 +122,132 @@ struct lex { size_t tokbuf_size; }; -enum stmt_kind { - SK_NONE = 0, - SK_LABEL, - SK_INSTRUCTION, - SK_TEXT, - SK_DIR_FILE, - SK_DIR_TEXT, - SK_DIR_ALIGN, - SK_DIR_DEF_ENDEF, - SK_DIR_GLOBL, - SK_DIR_LINE, +enum stmt_type { + ST_NONE = 0, + ST_LABEL, + ST_INSTRUCTION, + ST_ASSIGNMENT, + ST_COMMENT, + ST_DIRECTIVE, }; enum mnemonic { - OPCODE_NONE, - OPCODE_NOP, + MN_NONE = 0, + MN_ABCD, + MN_ADD, + MN_ADDA, + MN_ADDI, + MN_ADDQ, + MN_ADDX, + MN_AND, + MN_ANDI, + MN_ASL, + MN_BRA, + MN_BSR, + MN_BCC, + MN_BCS, + MN_BEQ, + MN_BGE, + MN_BGT, + MN_BHI, + MN_BLE, + MN_BLS, + MN_BLT, + MN_BMT, + MN_BNE, + MN_BPL, + MN_BVC, + MN_BVS, + MN_BCHG, + MN_BCLR, + MN_BSET, + MN_CHK, + MN_CLR, + MN_CMP, + MN_CMPA, + MN_CMPI, + MN_CMPM, + MN_DBT, + MN_DBF, + MN_DBCC, + MN_DBCS, + MN_DBEQ, + MN_DBGE, + MN_DBGT, + MN_DBHI, + MN_DBLE, + MN_DBLS, + MN_DBLT, + MN_DBMT, + MN_DBNE, + MN_DBPL, + MN_DBVC, + MN_DBVS, + MN_DIVU, + MN_DIVS, + MN_EORI, + MN_EXG, + MN_EXT, + MN_ILLEGAL, + MN_JMP, + MN_JSR, + MN_LEA, + MN_LINK, + MN_LSL, + MN_LSR, + MN_MOVE, + MN_MOVEA, + MN_MOVEM, + MN_MOVEP, + MN_MOVEQ, + MN_MULS, + MN_MULU, + MN_NBCD, + MN_NEG, + MN_NEGX, + MN_NOP, + MN_NOT, + MN_OR, + MN_ORI, + MN_PEA, + MN_RESET, + MN_ROL, + MN_ROR, + MN_ROXL, + MN_ROXR, + MN_RTE, + MN_RTR, + MN_RTS, + MN_SBCD, + MN_ST, + MN_SF, + MN_SCC, + MN_SCS, + MN_SEQ, + MN_SGE, + MN_SGT, + MN_SHI, + MN_SLE, + MN_SLS, + MN_SLT, + MN_SMT, + MN_SNE, + MN_SPL, + MN_SVC, + MN_SVS, + MN_STOP, + MN_SUB, + MN_SUBA, + MN_SUBI, + MN_SUBQ, + MN_SUBX, + MN_SWAP, + MN_TAS, + MN_TRAP, + MN_TRAPV, + MN_TST, + MN_UNLK, + MNEMONICS_COUNT, }; enum opsize { @@ -138,7 +258,7 @@ enum opsize { OPSIZE_L, }; -enum arg_kind { +enum arg_type { ARG_NONE = 0, ARG_DN, ARG_AN, @@ -146,29 +266,57 @@ enum arg_kind { ARG_AN_ADDR_INCR, ARG_AN_ADDR_DECR, ARG_AN_ADDR_16, - ARG_AN_ADDR_8_XN, + ARG_AN_ADDR_8_XI, ARG_ADDR_WORD, ARG_ADDR_LONG, ARG_ADDR_UNSPEC, ARG_PC_ADDR_16, ARG_PC_ADDR_8_XN, ARG_IMMEDIATE, + ARG_SR, + ARG_CCR, + ARG_USP, + ARG_EXPR, }; -struct arg_8_xn { - int8_t val; +enum args_count { + ARGS_COUNT_UNKNOWN = 0, + ARGS_COUNT_0, + ARGS_COUNT_0_1, + ARGS_COUNT_0_1_2, + ARGS_COUNT_0_2, + ARGS_COUNT_1, + ARGS_COUNT_1_2, + ARGS_COUNT_2, +}; + +struct arg_16 { + int16_t d; + int8_t an; +}; + +struct arg_8 { + int8_t d; int8_t an; int8_t xi; }; +union arg_contents { + int32_t imm, addr, xn; + struct arg_16 arg_16; // For (d16,An) and (d16,PC) + struct arg_8 arg_8; // For (d8,An,Xi) and (d8,PC,Xn) +}; + +struct arg { + enum arg_type arg_type; + union arg_contents arg_contents; + size_t first_token, num_tokens; // Expression tokens span, may be NULL +}; + struct instruction { enum mnemonic mnemonic; enum opsize opsize; - enum arg_kind arg1_kind, arg2_kind; - union { - int32_t imm, addr; - struct arg_8_xn arg_8_xn; // For (d,An,Xi) and (d,PC,Xn) - } arg1, arg2; + struct arg arg1, arg2; }; struct def_endef { @@ -180,13 +328,14 @@ struct def_endef { }; struct stmt { - enum stmt_kind type; + enum stmt_type type; union { struct instruction instruction; int32_t align; size_t globl_sym_id; size_t file_sym_id; }; + size_t label_token; size_t first_token, num_tokens; // Statement tokens span, may be NULL size_t comment_token; }; @@ -257,6 +406,127 @@ const char *const g_escape_table[256] = { "\\xfd", "\\xfe", }; +struct mnemonic_meta { + const char *str; + enum args_count args_count; +} g_mnemmonics[MNEMONICS_COUNT] = { + { "none", ARGS_COUNT_0 }, + { "abcd", ARGS_COUNT_2 }, + { "add", ARGS_COUNT_2 }, + { "adda", ARGS_COUNT_2 }, + { "addi", ARGS_COUNT_2 }, + { "addq", ARGS_COUNT_2 }, + { "addx", ARGS_COUNT_2 }, + { "and", ARGS_COUNT_2 }, + { "andi", ARGS_COUNT_2 }, + { "asl", ARGS_COUNT_1_2 }, + { "bra", ARGS_COUNT_1 }, + { "bsr", ARGS_COUNT_1 }, + { "bcc", ARGS_COUNT_1 }, + { "bcs", ARGS_COUNT_1 }, + { "beq", ARGS_COUNT_1 }, + { "bge", ARGS_COUNT_1 }, + { "bgt", ARGS_COUNT_1 }, + { "bhi", ARGS_COUNT_1 }, + { "ble", ARGS_COUNT_1 }, + { "bls", ARGS_COUNT_1 }, + { "blt", ARGS_COUNT_1 }, + { "bmt", ARGS_COUNT_1 }, + { "bne", ARGS_COUNT_1 }, + { "bpl", ARGS_COUNT_1 }, + { "bvc", ARGS_COUNT_1 }, + { "bvs", ARGS_COUNT_1 }, + { "bchg", ARGS_COUNT_2 }, + { "bclr", ARGS_COUNT_2 }, + { "bset", ARGS_COUNT_2 }, + { "chk", ARGS_COUNT_2 }, + { "clr", ARGS_COUNT_1 }, + { "cmp", ARGS_COUNT_2 }, + { "cmpa", ARGS_COUNT_2 }, + { "cmpi", ARGS_COUNT_2 }, + { "cmpm", ARGS_COUNT_2 }, + { "dbt", ARGS_COUNT_2 }, + { "dbf", ARGS_COUNT_2 }, + { "dbcc", ARGS_COUNT_2 }, + { "dbcs", ARGS_COUNT_2 }, + { "dbeq", ARGS_COUNT_2 }, + { "dbge", ARGS_COUNT_2 }, + { "dbgt", ARGS_COUNT_2 }, + { "dbhi", ARGS_COUNT_2 }, + { "dble", ARGS_COUNT_2 }, + { "dbls", ARGS_COUNT_2 }, + { "dblt", ARGS_COUNT_2 }, + { "dbmt", ARGS_COUNT_2 }, + { "dbne", ARGS_COUNT_2 }, + { "dbpl", ARGS_COUNT_2 }, + { "dbvc", ARGS_COUNT_2 }, + { "dbvs", ARGS_COUNT_2 }, + { "divu", ARGS_COUNT_2 }, + { "divs", ARGS_COUNT_2 }, + { "eori", ARGS_COUNT_2 }, + { "exg", ARGS_COUNT_2 }, + { "ext", ARGS_COUNT_1 }, + { "illegal", ARGS_COUNT_0 }, + { "jmp", ARGS_COUNT_1 }, + { "jsr", ARGS_COUNT_1 }, + { "lea", ARGS_COUNT_2 }, + { "link", ARGS_COUNT_2 }, + { "lsl", ARGS_COUNT_1_2 }, + { "lsr", ARGS_COUNT_1_2 }, + { "move", ARGS_COUNT_2 }, + { "movea", ARGS_COUNT_2 }, + { "movem", ARGS_COUNT_2 }, + { "movep", ARGS_COUNT_2 }, + { "moveq", ARGS_COUNT_2 }, + { "muls", ARGS_COUNT_2 }, + { "mulu", ARGS_COUNT_2 }, + { "nbcd", ARGS_COUNT_1 }, + { "neg", ARGS_COUNT_1 }, + { "negx", ARGS_COUNT_1 }, + { "nop", ARGS_COUNT_0 }, + { "not", ARGS_COUNT_1 }, + { "or", ARGS_COUNT_2 }, + { "ori", ARGS_COUNT_2 }, + { "pea", ARGS_COUNT_1 }, + { "reset", ARGS_COUNT_0 }, + { "rol", ARGS_COUNT_1_2 }, + { "ror", ARGS_COUNT_1_2 }, + { "roxl", ARGS_COUNT_1_2 }, + { "roxr", ARGS_COUNT_1_2 }, + { "rte", ARGS_COUNT_0 }, + { "rtr", ARGS_COUNT_0 }, + { "rts", ARGS_COUNT_0 }, + { "sbcd", ARGS_COUNT_2 }, + { "st", ARGS_COUNT_1 }, + { "sf", ARGS_COUNT_1 }, + { "scc", ARGS_COUNT_1 }, + { "scs", ARGS_COUNT_1 }, + { "seq", ARGS_COUNT_1 }, + { "sge", ARGS_COUNT_1 }, + { "sgt", ARGS_COUNT_1 }, + { "shi", ARGS_COUNT_1 }, + { "sle", ARGS_COUNT_1 }, + { "sls", ARGS_COUNT_1 }, + { "slt", ARGS_COUNT_1 }, + { "smt", ARGS_COUNT_1 }, + { "sne", ARGS_COUNT_1 }, + { "spl", ARGS_COUNT_1 }, + { "svc", ARGS_COUNT_1 }, + { "svs", ARGS_COUNT_1 }, + { "stop", ARGS_COUNT_1 }, + { "sub", ARGS_COUNT_2 }, + { "suba", ARGS_COUNT_2 }, + { "subi", ARGS_COUNT_2 }, + { "subq", ARGS_COUNT_2 }, + { "subx", ARGS_COUNT_2 }, + { "swap", ARGS_COUNT_1 }, + { "tas", ARGS_COUNT_1 }, + { "trap", ARGS_COUNT_1 }, + { "trapv", ARGS_COUNT_0 }, + { "tst", ARGS_COUNT_1 }, + { "unlk", ARGS_COUNT_1 }, +}; + static bool should_be_escaped(const int c) { return c < ' ' || c == '"' || c == '\\' || c == '<' || c == '>' || c > '~'; @@ -312,7 +582,7 @@ static int fprint_string_escaped( return written; } -static const char *tok_kind_to_string(const enum token_type type) +static const char *token_type_to_string(const enum token_type type) { switch (type) { case TT_NONE: return "NONE"; @@ -332,7 +602,6 @@ static const char *tok_kind_to_string(const enum token_type type) case TT_RIGHT_SHIFT: return "RIGHT_SHIFT"; case TT_HASH: return "HASH"; case TT_BANG: return "BANG"; - case TT_DOLLAR: return "DOLLAR"; case TT_TILDE: return "TILDE"; case TT_AMPERSAND: return "AMPERSAND"; case TT_PIPE: return "PIPE"; @@ -351,13 +620,13 @@ static const char *tok_kind_to_string(const enum token_type type) case TT_COMMENT_ASTERISK: return "COMMENT"; case TT_COMMENT_SEMICOLON: return "COMMENT"; } - assert(0); - return "UNKNOWN"; + UNREACHABLE(); + return "_UNKNOWN"; } static int fprint_tok(const char *const input, struct token *token, FILE *const stream) { - int res = fprintf(stream, "%s<", tok_kind_to_string(token->type)); + int res = fprintf(stream, "%s<", token_type_to_string(token->type)); if (res == -1) { return -1; } @@ -375,6 +644,13 @@ static int fprint_tok(const char *const input, struct token *token, FILE *const return written; } +static int fwrite_token(const struct token *const token, FILE *const stream) +{ + const int res = fwrite(token, sizeof *token, 1, stream); + assert(res == 1); + return res; +} + static int lex_init(struct lex *const self) { *self = (struct lex){ @@ -384,16 +660,13 @@ static int lex_init(struct lex *const self) }; assert(self->input_stream != NULL); assert(self->tokbuf_stream != NULL); + // Place a dummy token at 0 index, so first real token will be at index 1. + // This is needed for parser, so it can use zero to indicate absence of + // token. + fwrite_token(&(struct token){TT_NONE}, self->tokbuf_stream); return OK; } -static int fwrite_token(const struct token *const token, FILE *const stream) -{ - const int res = fwrite(token, sizeof *token, 1, stream); - assert(res == 1); - return res; -} - static void lex_yield_token(struct lex *const self, const struct token *const token) { self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE); @@ -443,18 +716,19 @@ static const char *lex_state_error_string( case LS_COMMENT_SEMICOLON: case LS_ERROR: case LS_EOF: - assert(0); + UNREACHABLE(); break; } return "???"; } -static struct line_pos_info lex_get_line_pos_info(const struct lex *const self) +static struct line_pos_info lex_get_line_pos_info( + const struct lex *const self, const size_t cursor) { struct line_pos_info l = {0, 0, 0}; bool cr = false; // `input` is null terminated, that's why we subtract 1 here - for (size_t i = 0; i < self->input_size - 1; i++) { + for (size_t i = 0; i < cursor; i++) { const char c = self->input[i]; if (c == '\r') { cr = true; @@ -479,8 +753,8 @@ static struct line_pos_info lex_get_line_pos_info(const struct lex *const self) static int lex_yield_error(struct lex *const self, const int c) { fflush(self->input_stream); - const struct line_pos_info l = lex_get_line_pos_info(self); const size_t cursor = self->cursor; + const struct line_pos_info l = lex_get_line_pos_info(self, cursor); { // Read out the rest of the line int c; @@ -531,6 +805,9 @@ static int lex_handle_next(struct lex *const self, const int c) } else if (c == '@') { self->tok_offset = self->cursor; self->state = LS_NUMOCT; + } else if (c == '$') { + self->tok_offset = self->cursor; + self->state = LS_NUMHEX; } else if (c == '"') { self->tok_offset = self->cursor; self->state = LS_STRING; @@ -584,8 +861,6 @@ static int lex_handle_next(struct lex *const self, const int c) lex_yield_token(self, &(struct token){TT_HASH, self->cursor, 1}); } else if (c == '!') { lex_yield_token(self, &(struct token){TT_BANG, self->cursor, 1}); - } else if (c == '$') { - lex_yield_token(self, &(struct token){TT_DOLLAR, self->cursor, 1}); } else if (c == '~') { lex_yield_token(self, &(struct token){TT_TILDE, self->cursor, 1}); } else if (c == '&') { @@ -757,7 +1032,7 @@ static int lex_handle_next(struct lex *const self, const int c) case LS_ERROR: return ERR; case LS_EOF: - assert(0); + UNREACHABLE(); } return CONTINUE; } @@ -815,99 +1090,619 @@ static void lex_destroy(struct lex *const self) free(self->tokbuf); } +static enum args_count get_args_count_for_mnemonic(const enum mnemonic m) +{ + assert(m < MNEMONICS_COUNT); + return g_mnemmonics[m].args_count; +} + +static const char *mnemonic_to_string(const enum mnemonic m) +{ + assert(m < MNEMONICS_COUNT); + return g_mnemmonics[m].str; +} + +static const char *opsize_to_string(const enum opsize s) +{ + switch (s) { + case OPSIZE_NONE: return "none"; + case OPSIZE_S: return "short"; + case OPSIZE_B: return "byte"; + case OPSIZE_W: return "word"; + case OPSIZE_L: return "long"; + } + UNREACHABLE(); + return "_unknown"; +} + +static enum mnemonic get_mnemonic_from_identifier( + const char *const str, const size_t str_length) +{ + if (str_length > 7) { + return MN_NONE; + } + char mnemonic_str[8] = {0}; + for (size_t i = 0; i < str_length; i++) { + mnemonic_str[i] = tolower(str[i]); + } + // Start from 1 since - is dummy NONE + for (size_t i = 1; i < MNEMONICS_COUNT; i++) { + if (0 == strcmp(mnemonic_str, g_mnemmonics[i].str)) { + return (enum mnemonic)i; + } + } + return MN_NONE; +} + +static const char *arg_type_to_string(const enum arg_type type) +{ + switch (type) { + case ARG_NONE: return "NONE"; + case ARG_DN: return "Dn"; + case ARG_AN: return "An"; + case ARG_AN_ADDR: return "(An)"; + case ARG_AN_ADDR_INCR: return "(An)+"; + case ARG_AN_ADDR_DECR: return "-(An)"; + case ARG_AN_ADDR_16: return "(d16,An)"; + case ARG_AN_ADDR_8_XI: return "(d8,An,Xi)"; + case ARG_ADDR_WORD: return "(xxx).w"; + case ARG_ADDR_LONG: return "(xxx).l"; + case ARG_ADDR_UNSPEC: return "(xxx).?"; + case ARG_PC_ADDR_16: return "(d16,PC)"; + case ARG_PC_ADDR_8_XN: return "(d8,PC,Xn)"; + case ARG_IMMEDIATE: return "#imm"; + case ARG_SR: return "SR"; + case ARG_CCR: return "CCR"; + case ARG_USP: return "USP"; + case ARG_EXPR: return "EXPR"; + } + UNREACHABLE(); + return "_UNKNOWN"; +} + static int pars_init(struct pars *const self, const struct lex *const lex) { *self = (struct pars){ .lex = lex, .stmttab_stream = open_memstream( (char **)&self->stmttab, &self->stmttab_size), + .symtab_stream = open_memstream( + (char **)&self->symtab, &self->symtab_size), .symbuf_stream = open_memstream(&self->symbuf, &self->symbuf_size), }; assert(self->stmttab_stream != NULL); + assert(self->symtab_stream != NULL); assert(self->symbuf_stream != NULL); return OK; } -static int pars_yield_error(struct pars *const self) +static bool pars_is_eof_reached(const struct pars *const self) +{ + const size_t tokens_count = self->lex->tokbuf_size / + (sizeof *self->lex->tokbuf); + return self->cur_tok_id >= tokens_count; +} + +static const char *stmt_type_to_string(const enum stmt_type type) +{ + switch (type) { + case ST_NONE: return "NONE"; + case ST_LABEL: return "LABEL"; + case ST_INSTRUCTION: return "INSTRUCTION"; + case ST_ASSIGNMENT: return "ASSIGNMENT"; + case ST_COMMENT: return "COMMENT"; + case ST_DIRECTIVE: return "DIRECTIVE"; + } + return "_UNKNOWN"; +} + +static void fprint_arg( + const struct lex *const lex, + const struct arg *const arg, + FILE *const s) +{ + fprintf(s, "(%s", arg_type_to_string(arg->arg_type)); + switch (arg->arg_type) { + case ARG_NONE: + case ARG_DN: + case ARG_AN: + case ARG_AN_ADDR: + case ARG_AN_ADDR_INCR: + case ARG_AN_ADDR_DECR: + fprintf(s, " reg %d", arg->arg_contents.xn); + break; + case ARG_AN_ADDR_16: + fprintf(s, " reg %d", arg->arg_contents.arg_16.an); + fprintf(s, " d16 %d", arg->arg_contents.arg_16.d); + break; + case ARG_AN_ADDR_8_XI: + fprintf(s, " reg %d", arg->arg_contents.arg_8.an); + fprintf(s, " d8 %d", arg->arg_contents.arg_8.d); + fprintf(s, " xi %d", arg->arg_contents.arg_8.xi); + break; + case ARG_ADDR_WORD: + case ARG_ADDR_LONG: + case ARG_ADDR_UNSPEC: + fprintf(s, " addr %d", arg->arg_contents.addr); + break; + case ARG_PC_ADDR_16: + fprintf(s, " d16 %d", arg->arg_contents.arg_16.d); + break; + case ARG_PC_ADDR_8_XN: + fprintf(s, " d8 %d", arg->arg_contents.arg_8.d); + fprintf(s, " xn %d", arg->arg_contents.arg_8.xi); + break; + case ARG_IMMEDIATE: + fprintf(s, " value %d", arg->arg_contents.imm); + break; + case ARG_SR: + case ARG_CCR: + case ARG_USP: + case ARG_EXPR: + break; + } + fprintf(s, " raw \""); + for (size_t i = 0; i < arg->num_tokens; i++) { + const struct token token = lex->tokbuf[arg->first_token + i]; + if (token.type == TT_NEWLINE) { + break; + } + fprintf(s, "%.*s ", (int)token.length, lex->input + token.offset); + } + fprintf(s, "\")"); +} + +static int fprint_stmt( + const struct lex *const lex, + struct stmt *const stmt, + FILE *const s) +{ + assert(stmt); + fprintf(s, "(%s", stmt_type_to_string(stmt->type)); + if (stmt->label_token) { + const struct token label = lex->tokbuf[stmt->label_token]; + fprintf(s, "\n\t(label \"%.*s\")", (int)label.length, lex->input + label.offset); + } + if (stmt->type == ST_INSTRUCTION) { + fprintf(s, "\n\t(mnemonic \"%s\")", mnemonic_to_string(stmt->instruction.mnemonic)); + fprintf(s, "\n\t(size %s)", opsize_to_string(stmt->instruction.opsize)); + if (stmt->instruction.arg1.arg_type != ARG_NONE) { + fprintf(s, "\n\t(arg1 "); + fprint_arg(lex, &stmt->instruction.arg1, s); + fprintf(s, ")"); + } + if (stmt->instruction.arg2.arg_type != ARG_NONE) { + assert(stmt->instruction.arg1.arg_type != ARG_NONE); + fprintf(s, "\n\t(arg2 "); + fprint_arg(lex, &stmt->instruction.arg2, s); + fprintf(s, ")"); + } + } + if (stmt->comment_token) { + const struct token comment = lex->tokbuf[stmt->comment_token]; + fprintf(s, "\n\t(comment \"%.*s\")", (int)comment.length, lex->input + comment.offset); + } + fprintf(s, "\n\t(raw \""); + for (size_t i = 0; i < stmt->num_tokens; i++) { + const struct token token = lex->tokbuf[stmt->first_token + i]; + if (token.type == TT_NEWLINE) { + break; + } + fprintf(s, "%.*s ", (int)token.length, lex->input + token.offset); + } + fprintf(s, "\"))\n"); + return 0; +} + +static int fwrite_stmt(const struct stmt *const stmt, FILE *const stream) +{ + const int res = fwrite(stmt, sizeof *stmt, 1, stream); + assert(res == 1); + return res; +} + +static size_t find_line_length(const char *const str) +{ + for (size_t i = 0;; i++) { + const char c = str[i]; + if (c == '\n' || c == '\r' || c == '\000') { + return i; + } + } + return 0; +} + +static int pars_yield_error_str( + struct pars *const self, const struct line_pos_info l, char *const str) +{ + fprintf( + stderr, + "<stdin>:%lu:%lu: parsing error: expected %s, found '%s'\n", + l.line_num + 1, + l.column_num + 1, + "<TODO>", + str); + free(str); + const size_t line_length = find_line_length(self->lex->input + l.line_offset); + char *const line = calloc(1, line_length + 1); + memcpy(line, self->lex->input + l.line_offset, line_length); + fprintf(stderr, "%5lu | %s\n", l.line_num + 1, line); + free(line); + fputs(" | ", stderr); + for (size_t i = 0; i < l.column_num; i++) { + if (self->lex->input[l.line_offset + i] == '\t') { + fputc('\t', stderr); + } else { + fputc(' ', stderr); + } + } + fputs("^\n", stderr); + return ERR; +} + +static int pars_yield_error(struct pars *const self, const size_t token_id) +{ + const struct token token = self->lex->tokbuf[token_id]; + const struct line_pos_info l = + lex_get_line_pos_info(self->lex, token.offset); + char *const found = calloc(1, token.length + 10); + snprintf(found, token.length + 1, "%s", self->lex->input + token.offset); + return pars_yield_error_str(self, l, found); +} + +static int pars_yield_error_nesting( + struct pars *const self, + const size_t expression_start_token_id, + const size_t expression_length_tokens) { (void) self; - // TODO + (void) expression_start_token_id; + (void) expression_length_tokens; return ERR; } -static int pars_parse_label(struct pars *const self) +static int pars_yield_error_eof(struct pars *const self) { - const struct token label = self->lex->tokbuf[self->cur_tok_id++]; - if (label.type != TT_IDENTIFIER) { - return pars_yield_error(self); - } - // TODO - return OK; + (void) self; + return ERR; } -static int pars_parse_instr(struct pars *const self, const struct token mnemo) +static int pars_parse_direc( + struct pars *const self, const struct token *const dot) { (void) self; - (void) mnemo; - // TODO + (void) dot; + return pars_yield_error(self, self->cur_tok_id); +} + +enum opsize get_opsize_from_specifier(const char size_specifier) +{ + switch (tolower(size_specifier)) { + case 's': return OPSIZE_S; + case 'b': return OPSIZE_B; + case 'w': return OPSIZE_W; + case 'l': return OPSIZE_L; + } + return OPSIZE_NONE; +} + +static bool is_expression_token(const enum token_type type) +{ + switch (type) { + case TT_PLUS: return true; + case TT_MINUS: return true; + case TT_ASTERISK: return true; + case TT_SLASH: return true; + case TT_PERCENT: return true; + case TT_LEFT_SHIFT: return true; + case TT_RIGHT_SHIFT: return true; + case TT_HASH: return true; + case TT_BANG: return true; + case TT_TILDE: return true; + case TT_AMPERSAND: return true; + case TT_PIPE: return true; + case TT_CAP: return true; + case TT_IDENTIFIER: return true; + case TT_NUMDEC: return true; + case TT_NUMOCT: return true; + case TT_NUMHEX: return true; + case TT_LPAREN: return true; + case TT_RPAREN: return true; + default: return false; + } + return false; +} + +static int pars_parse_arg( + struct pars *const self, struct arg *const arg) +{ + const size_t tokens_count = self->lex->tokbuf_size / + (sizeof *self->lex->tokbuf); + const size_t first_token_id = self->cur_tok_id; + int nesting = 0; + enum arg_type arg_type = ARG_EXPR; + while (self->cur_tok_id < tokens_count) { + const size_t token_id = self->cur_tok_id; // Peek + const struct token token = self->lex->tokbuf[token_id]; + if (token.type == TT_LPAREN) { + nesting++; + } else if (token.type == TT_RPAREN) { + nesting--; + } else if (is_expression_token(token.type)) { + // TODO parse expression + } else if (nesting > 0 && token.type == TT_COMMA) { + // Comma inside parentheses is allowed + } else { + break; + } + self->cur_tok_id++; // Commit + } + if (nesting != 0) { + return pars_yield_error_nesting( + self, first_token_id, self->cur_tok_id - first_token_id); + } + if (first_token_id == self->cur_tok_id) { + // Nothing has been parsed + *arg = (struct arg){0}; + } else { + *arg = (struct arg){ + .arg_type = arg_type, + // TODO arg_contents + .first_token = first_token_id, + .num_tokens = self->cur_tok_id - first_token_id, + }; + } return OK; } -static int pars_parse_direc(struct pars *const self) +static int pars_yield_instruction( + struct pars *const self, + const size_t label_id, + const size_t comment_id, + const size_t mnemonic_id, + const enum opsize opsize, + const struct arg *const arg1, + const struct arg *const arg2) { - const char *input = self->lex->input; - const struct token direc = self->lex->tokbuf[self->cur_tok_id++]; - if (direc.type != TT_IDENTIFIER) { - return pars_yield_error(self); + const struct token mnemonic_token = self->lex->tokbuf[mnemonic_id]; + const enum mnemonic mnemonic = get_mnemonic_from_identifier( + self->lex->input + mnemonic_token.offset, mnemonic_token.length); + if (mnemonic == MN_NONE) { + return pars_yield_error(self, mnemonic_id); + } + if (arg2) { + assert(arg1); } - if (0 == strcmp(input + direc.offset, "def")) { - } else if (0 == strcmp(input + direc.offset, "opt")) { - } else if (0 == strcmp(input + direc.offset, "file")) { - } else if (0 == strcmp(input + direc.offset, "text")) { - } else if (0 == strcmp(input + direc.offset, "align")) { - } else if (0 == strcmp(input + direc.offset, "globl")) { - } else if (0 == strcmp(input + direc.offset, "ln")) { - } else if (0 == strcmp(input + direc.offset, "long")) { - } else if (0 == strcmp(input + direc.offset, "word")) { - } else if (0 == strcmp(input + direc.offset, "byte")) { - } else if (0 == strcmp(input + direc.offset, "bin")) { + const enum args_count args_count = get_args_count_for_mnemonic(mnemonic); + // Validate instruction arguments count + switch (args_count) { + case ARGS_COUNT_UNKNOWN: + UNREACHABLE(); + break; + case ARGS_COUNT_0: + if (arg1) { + return pars_yield_error(self, arg1->first_token); + } + break; + case ARGS_COUNT_0_1: + if (arg2) { + return pars_yield_error(self, arg2->first_token); + } + break; + case ARGS_COUNT_0_1_2: + break; + case ARGS_COUNT_0_2: + if (arg1 && !arg2) { + return pars_yield_error(self, mnemonic_id); + } + break; + case ARGS_COUNT_1: + if (!arg1) { + return pars_yield_error(self, mnemonic_id); + } else if (arg2) { + return pars_yield_error(self, arg2->first_token); + } + break; + case ARGS_COUNT_1_2: + if (!arg1) { + return pars_yield_error(self, mnemonic_id); + } + break; + case ARGS_COUNT_2: + if (!arg1 || !arg2) { + return pars_yield_error(self, mnemonic_id); + } + break; } - // TODO + const size_t first_token_id = label_id ? label_id : mnemonic_id; + const struct stmt stmt = { + .type = ST_INSTRUCTION, + .instruction = { + .mnemonic = mnemonic, + .opsize = opsize, + .arg1 = arg1 ? *arg1 : (struct arg){0}, + .arg2 = arg2 ? *arg2 : (struct arg){0}, + }, + .label_token = label_id, + .comment_token = comment_id, + .first_token = first_token_id, + .num_tokens = self->cur_tok_id - first_token_id, + }; + fwrite_stmt(&stmt, self->stmttab_stream); return OK; } -static int pars_parse_instr_or_direc(struct pars *const self) +static int pars_parse_instruction_comment( + struct pars *const self, + const size_t label_id, + const size_t mnemonic_id, + const enum opsize opsize, + const struct arg *const arg1, + const struct arg *const arg2) { - struct token token; - if (token.type == TT_DOT) { - return pars_parse_direc(self); - } else if (token.type == TT_IDENTIFIER) { - return pars_parse_instr(self, token); - } else { - return pars_yield_error(self); + size_t comment_id = 0; + if (!pars_is_eof_reached(self)) { + // Try parse comment + const size_t token1_id = self->cur_tok_id; // Peek comment + const struct token token1 = self->lex->tokbuf[token1_id]; + const bool is_comment = token1.type == TT_COMMENT_ASTERISK || + token1.type == TT_COMMENT_SEMICOLON; + if (is_comment) { + self->cur_tok_id++; // Commit comment + comment_id = token1_id; + } + if (!pars_is_eof_reached(self)) { + // Handle new line + const size_t nl_id = self->cur_tok_id++; // Commit new line + const struct token nl = self->lex->tokbuf[nl_id]; + if (nl.type != TT_NEWLINE) { + return pars_yield_error(self, nl_id); + } + } + } + return pars_yield_instruction( + self, label_id, comment_id, mnemonic_id, opsize, arg1, arg2); +} + +static int pars_parse_instruction_args( + struct pars *const self, + const size_t label_id, + const size_t mnemonic_id, + const enum opsize opsize) +{ + struct arg arg1, arg2; + // Try parse first argument + const int res1 = pars_parse_arg(self, &arg1); + if (res1 != OK) { + return res1; + } + if (arg1.arg_type == ARG_NONE) { + return pars_parse_instruction_comment( + self, label_id, mnemonic_id, opsize, NULL, NULL); + } + if (pars_is_eof_reached(self)) { + return pars_yield_instruction( + self, label_id, 0, mnemonic_id, opsize, &arg1, NULL); + } + const size_t comma_id = self->cur_tok_id; // Peek comma + const struct token comma = self->lex->tokbuf[comma_id]; + if (comma.type != TT_COMMA) { + return pars_parse_instruction_comment( + self, label_id, mnemonic_id, opsize, NULL, NULL); + } + self->cur_tok_id++; // Commit comma + // Try parse second argument + const int res2 = pars_parse_arg(self, &arg2); + if (res2 != OK) { + return res2; + } + if (pars_is_eof_reached(self)) { + return pars_yield_instruction( + self, label_id, 0, mnemonic_id, opsize, &arg1, &arg2); + } + // Finish parsing instruction, expect comment or newline + return pars_parse_instruction_comment( + self, label_id, mnemonic_id, opsize, &arg1, &arg2); +} + +static int pars_parse_instruction( + struct pars *const self, + const size_t label_id, + const size_t mnemonic_id) +{ + if (pars_is_eof_reached(self)) { + return pars_yield_error_eof(self); + } + const size_t token2_id = self->cur_tok_id; // Peek + const struct token token2 = self->lex->tokbuf[token2_id]; + if (token2.type == TT_DOT) { + self->cur_tok_id++; // Commit + if (pars_is_eof_reached(self)) { + return pars_yield_error_eof(self); + } + const size_t size_spec_id = self->cur_tok_id++; + const struct token size_spec = self->lex->tokbuf[size_spec_id]; + if (size_spec.type != TT_IDENTIFIER) { + return pars_yield_error(self, size_spec_id); + } + // Size specifier + if (size_spec.length != 1) { + return pars_yield_error(self, size_spec_id); + } + const size_t opsize = + get_opsize_from_specifier(self->lex->input[size_spec.offset]); + if (opsize == OPSIZE_NONE) { + return pars_yield_error(self, size_spec_id); + } + if (pars_is_eof_reached(self)) { + return pars_yield_error_eof(self); + } + return pars_parse_instruction_args(self, label_id, mnemonic_id, opsize); + } + return pars_parse_instruction_args( + self, label_id, mnemonic_id, OPSIZE_NONE); +} + +static int pars_parse_assignment( + struct pars *const self, const size_t label_id, const size_t symbol_id) +{ + (void) label_id; + (void) symbol_id; + return pars_yield_error(self, self->cur_tok_id); +} + +static int pars_yield_label_comment( + struct pars *const self, const size_t label_id, const size_t comment_id) +{ + if (label_id || comment_id) { + const size_t first_token = label_id ? label_id : comment_id; + struct stmt stmt = { + .type = label_id ? ST_LABEL : ST_COMMENT, + .label_token = label_id, + .comment_token = comment_id, + .first_token = first_token, + .num_tokens = self->cur_tok_id - first_token, + }; + fwrite_stmt(&stmt, self->stmttab_stream); } return OK; } -static int pars_parse_statement(struct pars *const self) +static int pars_parse_labeled_statement( + struct pars *const self, const size_t label_id) { - const size_t tokens_count = self->lex->tokbuf_size / - (sizeof *self->lex->tokbuf); - const struct token token = self->lex->tokbuf[self->cur_tok_id++]; - const bool is_comment = token.type == TT_COMMENT_ASTERISK || - token.type == TT_COMMENT_SEMICOLON; - if (token.type == TT_IDENTIFIER) { - return pars_parse_label(self); - } else if (!is_comment) { - if (self->cur_tok_id < tokens_count) { - const struct token nl = self->lex->tokbuf[self->cur_tok_id++]; - assert(nl.type == TT_NEWLINE); + const size_t token1_id = self->cur_tok_id++; + const struct token token1 = self->lex->tokbuf[token1_id]; + const bool is_comment = token1.type == TT_COMMENT_ASTERISK || + token1.type == TT_COMMENT_SEMICOLON; + if (is_comment) { + return pars_yield_label_comment(self, label_id, token1_id); + } else if (token1.type == TT_NEWLINE) { + return pars_yield_label_comment(self, label_id, 0); + } else if (token1.type == TT_IDENTIFIER) { + if (pars_is_eof_reached(self)) { + return pars_yield_error_eof(self); } - return OK; - } else if (token.type == TT_NEWLINE) { - return OK; + const size_t token2_id = self->cur_tok_id; // Peek + const struct token token2 = self->lex->tokbuf[token2_id]; + if (!label_id && token2.type == TT_COLON) { + self->cur_tok_id++; // Commit + return pars_parse_labeled_statement(self, token1_id); + } else if (token2.type == TT_EQ || token2.type == TT_EQ_DOUBLE) { + self->cur_tok_id++; // Commit + return pars_parse_assignment(self, label_id, token2_id); + } else { + return pars_parse_instruction(self, label_id, token1_id); + } + } else if (token1.type == TT_DOT) { + return pars_parse_direc(self, &token1); } - return pars_yield_error(self); + return pars_yield_error(self, token1_id); +} + +static int pars_parse_statement(struct pars *const self) +{ + return pars_parse_labeled_statement(self, 0); } /** Run parser until the end of the input reached @@ -916,22 +1711,31 @@ static int pars_parse_statement(struct pars *const self) */ static int pars_run(struct pars *const self) { - return OK; const size_t tokens_count = self->lex->tokbuf_size / (sizeof *self->lex->tokbuf); - do { - const int ret = pars_parse_statement(self); + // Skip dummy token at position 0 + self->cur_tok_id = 1; + // Leave dummy statement at position 0 + fwrite_stmt(&(struct stmt){0}, self->stmttab_stream); + int ret = OK; + while (self->cur_tok_id < tokens_count) { + ret = pars_parse_statement(self); if (ret != OK) { - return ret; + break; } - } while (self->cur_tok_id < tokens_count); - return OK; + } + fflush(self->stmttab_stream); + fflush(self->symtab_stream); + fflush(self->symbuf_stream); + return ret; } static void pars_destroy(struct pars *const self) { fclose(self->stmttab_stream); free(self->stmttab); + fclose(self->symtab_stream); + free(self->symtab); fclose(self->symbuf_stream); free(self->symbuf); } @@ -952,12 +1756,18 @@ static int assem_resolve(struct assem *const self) static int assem_emit(struct assem *const self, FILE *const stream) { - if (TRACE_LEX) { - const struct lex *const lex = self->pars->lex; - for (size_t i = 0; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) { + const struct lex *const lex = self->pars->lex; + const struct pars *const pars = self->pars; + if (TRACE_LEXER) { + for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) { fprint_tok(lex->input, &lex->tokbuf[i], stream); } } + if (TRACE_PARSER) { + for (size_t i = 1; i < pars->stmttab_size / (sizeof *pars->stmttab); i++) { + fprint_stmt(lex, pars->stmttab + i, stream); + } + } return OK; } |