-rw-r--r--  Makefile |    2
-rw-r--r--  main.c   | 1024
2 files changed, 918 insertions, 108 deletions
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 WARNFLAGS = -Wall -Wextra -pedantic -Wlogical-op
 INCLUDES = lib
 
-_FLAGS = -O2 -fsanitize=address
+_FLAGS = -O2 -fsanitize=unreachable -fsanitize=address
 _CFLAGS = $(CFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g
 _CXXFLAGS = $(CXXFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g
 LDSCRIPTS =
--- a/main.c
+++ b/main.c
@@ -6,6 +6,7 @@
  */
 
 #include <assert.h>
+#include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -13,8 +14,18 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifndef TRACE_LEX
-#define TRACE_LEX 1
+#ifndef TRACE_LEXER
+#define TRACE_LEXER 0
+#endif
+
+#ifndef TRACE_PARSER
+#define TRACE_PARSER 1
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define UNREACHABLE __builtin_unreachable
+#else
+#define UNREACHABLE()
 #endif
 
 #define ERR 0
@@ -39,7 +50,6 @@ enum token_type {
     TT_RIGHT_SHIFT,
     TT_HASH,
     TT_BANG,
-    TT_DOLLAR,
     TT_TILDE,
     TT_AMPERSAND,
     TT_PIPE,
@@ -112,22 +122,132 @@ struct lex {
     size_t tokbuf_size;
 };
 
-enum stmt_kind {
-    SK_NONE = 0,
-    SK_LABEL,
-    SK_INSTRUCTION,
-    SK_TEXT,
-    SK_DIR_FILE,
-    SK_DIR_TEXT,
-    SK_DIR_ALIGN,
-    SK_DIR_DEF_ENDEF,
-    SK_DIR_GLOBL,
-    SK_DIR_LINE,
+enum stmt_type {
+    ST_NONE = 0,
+    ST_LABEL,
+    ST_INSTRUCTION,
+    ST_ASSIGNMENT,
+    ST_COMMENT,
+    ST_DIRECTIVE,
 };
 
 enum mnemonic {
-    OPCODE_NONE,
-    OPCODE_NOP,
+    MN_NONE = 0,
+    MN_ABCD,
+    MN_ADD,
+    MN_ADDA,
+    MN_ADDI,
+    MN_ADDQ,
+    MN_ADDX,
+    MN_AND,
+    MN_ANDI,
+    MN_ASL,
+    MN_BRA,
+    MN_BSR,
+    MN_BCC,
+    MN_BCS,
+    MN_BEQ,
+    MN_BGE,
+    MN_BGT,
+    MN_BHI,
+    MN_BLE,
+    MN_BLS,
+    MN_BLT,
+    MN_BMT,
+    MN_BNE,
+    MN_BPL,
+    MN_BVC,
+    MN_BVS,
+    MN_BCHG,
+    MN_BCLR,
+    MN_BSET,
+    MN_CHK,
+    MN_CLR,
+    MN_CMP,
+    MN_CMPA,
+    MN_CMPI,
+    MN_CMPM,
+    MN_DBT,
+    MN_DBF,
+    MN_DBCC,
+    MN_DBCS,
+    MN_DBEQ,
+    MN_DBGE,
+    MN_DBGT,
+    MN_DBHI,
+    MN_DBLE,
+    MN_DBLS,
+    MN_DBLT,
+    MN_DBMT,
+    MN_DBNE,
+    MN_DBPL,
+    MN_DBVC,
+    MN_DBVS,
+    MN_DIVU,
+    MN_DIVS,
+    MN_EORI,
+    MN_EXG,
+    MN_EXT,
+    MN_ILLEGAL,
+    MN_JMP,
+    MN_JSR,
+    MN_LEA,
+    MN_LINK,
+    MN_LSL,
+    MN_LSR,
+    MN_MOVE,
+    MN_MOVEA,
+    MN_MOVEM,
+    MN_MOVEP,
+    MN_MOVEQ,
+    MN_MULS,
+    MN_MULU,
+    MN_NBCD,
+    MN_NEG,
+    MN_NEGX,
+    MN_NOP,
+    MN_NOT,
+    MN_OR,
+    MN_ORI,
+    MN_PEA,
+    MN_RESET,
+    MN_ROL,
+    MN_ROR,
+    MN_ROXL,
+    MN_ROXR,
+    MN_RTE,
+    MN_RTR,
+    MN_RTS,
+    MN_SBCD,
+    MN_ST,
+    MN_SF,
+    MN_SCC,
+    MN_SCS,
+    MN_SEQ,
+    MN_SGE,
+    MN_SGT,
+    MN_SHI,
+    MN_SLE,
+    MN_SLS,
+    MN_SLT,
+    MN_SMT,
+    MN_SNE,
+    MN_SPL,
+    MN_SVC,
+    MN_SVS,
+    MN_STOP,
+    MN_SUB,
+    MN_SUBA,
+    MN_SUBI,
+    MN_SUBQ,
+    MN_SUBX,
+    MN_SWAP,
+    MN_TAS,
+    MN_TRAP,
+    MN_TRAPV,
+    MN_TST,
+    MN_UNLK,
+    MNEMONICS_COUNT,
 };
 
 enum opsize {
@@ -138,7 +258,7 @@ enum opsize {
     OPSIZE_L,
 };
 
-enum arg_kind {
+enum arg_type {
     ARG_NONE = 0,
     ARG_DN,
     ARG_AN,
@@ -146,29 +266,57 @@ enum arg_kind {
     ARG_AN_ADDR_INCR,
     ARG_AN_ADDR_DECR,
     ARG_AN_ADDR_16,
-    ARG_AN_ADDR_8_XN,
+    ARG_AN_ADDR_8_XI,
     ARG_ADDR_WORD,
     ARG_ADDR_LONG,
     ARG_ADDR_UNSPEC,
     ARG_PC_ADDR_16,
     ARG_PC_ADDR_8_XN,
     ARG_IMMEDIATE,
+    ARG_SR,
+    ARG_CCR,
+    ARG_USP,
+    ARG_EXPR,
 };
 
-struct arg_8_xn {
-    int8_t val;
+enum args_count {
+    ARGS_COUNT_UNKNOWN = 0,
+    ARGS_COUNT_0,
+    ARGS_COUNT_0_1,
+    ARGS_COUNT_0_1_2,
+    ARGS_COUNT_0_2,
+    ARGS_COUNT_1,
+    ARGS_COUNT_1_2,
+    ARGS_COUNT_2,
+};
+
+struct arg_16 {
+    int16_t d;
+    int8_t an;
+};
+
+struct arg_8 {
+    int8_t d;
     int8_t an;
     int8_t xi;
 };
 
+union arg_contents {
+    int32_t imm, addr, xn;
+    struct arg_16 arg_16; // For (d16,An) and (d16,PC)
+    struct arg_8 arg_8; // For (d8,An,Xi) and (d8,PC,Xn)
+};
+
+struct arg {
+    enum arg_type arg_type;
+    union arg_contents arg_contents;
+    size_t first_token, num_tokens; // Expression tokens span, may be NULL
+};
+
 struct instruction {
     enum mnemonic mnemonic;
     enum opsize opsize;
-    enum arg_kind arg1_kind, arg2_kind;
-    union {
-        int32_t imm, addr;
-        struct arg_8_xn arg_8_xn; // For (d,An,Xi) and (d,PC,Xn)
-    } arg1, arg2;
+    struct arg arg1, arg2;
 };
 
 struct def_endef {
@@ -180,13 +328,14 @@
 };
 
 struct stmt {
-    enum stmt_kind type;
+    enum stmt_type type;
     union {
         struct instruction instruction;
         int32_t align;
         size_t globl_sym_id;
         size_t file_sym_id;
     };
+    size_t label_token;
     size_t first_token, num_tokens; // Statement tokens span, may be NULL
     size_t comment_token;
 };
@@ -257,6 +406,127 @@ const char *const g_escape_table[256] = {
     "\\xfd", "\\xfe",
 };
 
+struct mnemonic_meta {
+    const char *str;
+    enum args_count args_count;
+} g_mnemmonics[MNEMONICS_COUNT] = {
+    { "none",   ARGS_COUNT_0 },
+    { "abcd",   ARGS_COUNT_2 },
+    { "add",    ARGS_COUNT_2 },
+    { "adda",   ARGS_COUNT_2 },
+    { "addi",   ARGS_COUNT_2 },
+    { "addq",   ARGS_COUNT_2 },
+    { "addx",   ARGS_COUNT_2 },
+    { "and",    ARGS_COUNT_2 },
+    { "andi",   ARGS_COUNT_2 },
+    { "asl",    ARGS_COUNT_1_2 },
+    { "bra",    ARGS_COUNT_1 },
+    { "bsr",    ARGS_COUNT_1 },
+    { "bcc",    ARGS_COUNT_1 },
+    { "bcs",    ARGS_COUNT_1 },
+    { "beq",    ARGS_COUNT_1 },
+    { "bge",    ARGS_COUNT_1 },
+    { "bgt",    ARGS_COUNT_1 },
+    { "bhi",    ARGS_COUNT_1 },
+    { "ble",    ARGS_COUNT_1 },
+    { "bls",    ARGS_COUNT_1 },
+    { "blt",    ARGS_COUNT_1 },
+    { "bmt",    ARGS_COUNT_1 },
+    { "bne",    ARGS_COUNT_1 },
+    { "bpl",    ARGS_COUNT_1 },
+    { "bvc",    ARGS_COUNT_1 },
+    { "bvs",    ARGS_COUNT_1 },
+    { "bchg",   ARGS_COUNT_2 },
+    { "bclr",   ARGS_COUNT_2 },
+    { "bset",   ARGS_COUNT_2 },
+    { "chk",    ARGS_COUNT_2 },
+    { "clr",    ARGS_COUNT_1 },
+    { "cmp",    ARGS_COUNT_2 },
+    { "cmpa",   ARGS_COUNT_2 },
+    { "cmpi",   ARGS_COUNT_2 },
+    { "cmpm",   ARGS_COUNT_2 },
+    { "dbt",    ARGS_COUNT_2 },
+    { "dbf",    ARGS_COUNT_2 },
+    { "dbcc",   ARGS_COUNT_2 },
+    { "dbcs",   ARGS_COUNT_2 },
+    { "dbeq",   ARGS_COUNT_2 },
+    { "dbge",   ARGS_COUNT_2 },
+    { "dbgt",   ARGS_COUNT_2 },
+    { "dbhi",   ARGS_COUNT_2 },
+    { "dble",   ARGS_COUNT_2 },
+    { "dbls",   ARGS_COUNT_2 },
+    { "dblt",   ARGS_COUNT_2 },
+    { "dbmt",   ARGS_COUNT_2 },
+    { "dbne",   ARGS_COUNT_2 },
+    { "dbpl",   ARGS_COUNT_2 },
+    { "dbvc",   ARGS_COUNT_2 },
+    { "dbvs",   ARGS_COUNT_2 },
+    { "divu",   ARGS_COUNT_2 },
+    { "divs",   ARGS_COUNT_2 },
+    { "eori",   ARGS_COUNT_2 },
+    { "exg",    ARGS_COUNT_2 },
+    { "ext",    ARGS_COUNT_1 },
+    { "illegal", ARGS_COUNT_0 },
+    { "jmp",    ARGS_COUNT_1 },
+    { "jsr",    ARGS_COUNT_1 },
+    { "lea",    ARGS_COUNT_2 },
+    { "link",   ARGS_COUNT_2 },
+    { "lsl",    ARGS_COUNT_1_2 },
+    { "lsr",    ARGS_COUNT_1_2 },
+    { "move",   ARGS_COUNT_2 },
+    { "movea",  ARGS_COUNT_2 },
+    { "movem",  ARGS_COUNT_2 },
+    { "movep",  ARGS_COUNT_2 },
+    { "moveq",  ARGS_COUNT_2 },
+    { "muls",   ARGS_COUNT_2 },
+    { "mulu",   ARGS_COUNT_2 },
+    { "nbcd",   ARGS_COUNT_1 },
+    { "neg",    ARGS_COUNT_1 },
+    { "negx",   ARGS_COUNT_1 },
+    { "nop",    ARGS_COUNT_0 },
+    { "not",    ARGS_COUNT_1 },
+    { "or",     ARGS_COUNT_2 },
+    { "ori",    ARGS_COUNT_2 },
+    { "pea",    ARGS_COUNT_1 },
+    { "reset",  ARGS_COUNT_0 },
+    { "rol",    ARGS_COUNT_1_2 },
+    { "ror",    ARGS_COUNT_1_2 },
+    { "roxl",   ARGS_COUNT_1_2 },
+    { "roxr",   ARGS_COUNT_1_2 },
+    { "rte",    ARGS_COUNT_0 },
+    { "rtr",    ARGS_COUNT_0 },
+    { "rts",    ARGS_COUNT_0 },
+    { "sbcd",   ARGS_COUNT_2 },
+    { "st",     ARGS_COUNT_1 },
+    { "sf",     ARGS_COUNT_1 },
+    { "scc",    ARGS_COUNT_1 },
+    { "scs",    ARGS_COUNT_1 },
+    { "seq",    ARGS_COUNT_1 },
+    { "sge",    ARGS_COUNT_1 },
+    { "sgt",    ARGS_COUNT_1 },
+    { "shi",    ARGS_COUNT_1 },
+    { "sle",    ARGS_COUNT_1 },
+    { "sls",    ARGS_COUNT_1 },
+    { "slt",    ARGS_COUNT_1 },
+    { "smt",    ARGS_COUNT_1 },
+    { "sne",    ARGS_COUNT_1 },
+    { "spl",    ARGS_COUNT_1 },
+    { "svc",    ARGS_COUNT_1 },
+    { "svs",    ARGS_COUNT_1 },
+    { "stop",   ARGS_COUNT_1 },
+    { "sub",    ARGS_COUNT_2 },
+    { "suba",   ARGS_COUNT_2 },
+    { "subi",   ARGS_COUNT_2 },
+    { "subq",   ARGS_COUNT_2 },
+    { "subx",   ARGS_COUNT_2 },
+    { "swap",   ARGS_COUNT_1 },
+    { "tas",    ARGS_COUNT_1 },
+    { "trap",   ARGS_COUNT_1 },
+    { "trapv",  ARGS_COUNT_0 },
+    { "tst",    ARGS_COUNT_1 },
+    { "unlk",   ARGS_COUNT_1 },
+};
+
 static bool should_be_escaped(const int c)
 {
     return c < ' ' || c == '"' || c == '\\' || c == '<' || c == '>' || c > '~';
@@ -312,7 +582,7 @@ static int fprint_string_escaped(
     return written;
 }
 
-static const char *tok_kind_to_string(const enum token_type type)
+static const char *token_type_to_string(const enum token_type type)
 {
     switch (type) {
     case TT_NONE: return "NONE";
@@ -332,7 +602,6 @@ static const char *tok_kind_to_string(const enum token_type type)
     case TT_RIGHT_SHIFT: return "RIGHT_SHIFT";
     case TT_HASH: return "HASH";
     case TT_BANG: return "BANG";
-    case TT_DOLLAR: return "DOLLAR";
     case TT_TILDE: return "TILDE";
     case TT_AMPERSAND: return "AMPERSAND";
     case TT_PIPE: return "PIPE";
@@ -351,13 +620,13 @@ static const char *tok_kind_to_string(const enum token_type type)
     case TT_COMMENT_ASTERISK: return "COMMENT";
     case TT_COMMENT_SEMICOLON: return "COMMENT";
     }
-    assert(0);
-    return "UNKNOWN";
+    UNREACHABLE();
+    return "_UNKNOWN";
 }
 
 static int fprint_tok(const char *const input, struct token *token, FILE *const stream)
 {
-    int res = fprintf(stream, "%s<", tok_kind_to_string(token->type));
+    int res = fprintf(stream, "%s<", token_type_to_string(token->type));
     if (res == -1) {
         return -1;
     }
@@ -375,6 +644,13 @@ static int fprint_tok(const char *const input, struct token *token, FILE *const
     return written;
 }
 
+static int fwrite_token(const struct token *const token, FILE *const stream)
+{
+    const int res = fwrite(token, sizeof *token, 1, stream);
+    assert(res == 1);
+    return res;
+}
+
 static int lex_init(struct lex *const self)
 {
     *self = (struct lex){
@@ -384,16 +660,13 @@ static int lex_init(struct lex *const self)
     };
     assert(self->input_stream != NULL);
     assert(self->tokbuf_stream != NULL);
+    // Place a dummy token at 0 index, so first real token will be at index 1.
+    // This is needed for parser, so it can use zero to indicate absence of
+    // token.
+    fwrite_token(&(struct token){TT_NONE}, self->tokbuf_stream);
     return OK;
 }
 
-static int fwrite_token(const struct token *const token, FILE *const stream)
-{
-    const int res = fwrite(token, sizeof *token, 1, stream);
-    assert(res == 1);
-    return res;
-}
-
 static void lex_yield_token(struct lex *const self, const struct token *const token)
 {
     self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE);
@@ -443,18 +716,19 @@ static const char *lex_state_error_string(
     case LS_COMMENT_SEMICOLON:
     case LS_ERROR:
     case LS_EOF:
-        assert(0);
+        UNREACHABLE();
         break;
     }
     return "???";
 }
 
-static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
+static struct line_pos_info lex_get_line_pos_info(
+        const struct lex *const self, const size_t cursor)
 {
     struct line_pos_info l = {0, 0, 0};
     bool cr = false;
     // `input` is null terminated, that's why we subtract 1 here
-    for (size_t i = 0; i < self->input_size - 1; i++) {
+    for (size_t i = 0; i < cursor; i++) {
         const char c = self->input[i];
         if (c == '\r') {
             cr = true;
@@ -479,8 +753,8 @@ static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
 static int lex_yield_error(struct lex *const self, const int c)
 {
     fflush(self->input_stream);
-    const struct line_pos_info l = lex_get_line_pos_info(self);
     const size_t cursor = self->cursor;
+    const struct line_pos_info l = lex_get_line_pos_info(self, cursor);
     {
         // Read out the rest of the line
         int c;
@@ -531,6 +805,9 @@ static int lex_handle_next(struct lex *const self, const int c)
         } else if (c == '@') {
             self->tok_offset = self->cursor;
             self->state = LS_NUMOCT;
+        } else if (c == '$') {
+            self->tok_offset = self->cursor;
+            self->state = LS_NUMHEX;
         } else if (c == '"') {
             self->tok_offset = self->cursor;
             self->state = LS_STRING;
@@ -584,8 +861,6 @@ static int lex_handle_next(struct lex *const self, const int c)
             lex_yield_token(self, &(struct token){TT_HASH, self->cursor, 1});
         } else if (c == '!') {
             lex_yield_token(self, &(struct token){TT_BANG, self->cursor, 1});
-        } else if (c == '$') {
-            lex_yield_token(self, &(struct token){TT_DOLLAR, self->cursor, 1});
         } else if (c == '~') {
             lex_yield_token(self, &(struct token){TT_TILDE, self->cursor, 1});
         } else if (c == '&') {
@@ -757,7 +1032,7 @@ static int lex_handle_next(struct lex *const self, const int c)
     case LS_ERROR:
         return ERR;
     case LS_EOF:
-        assert(0);
+        UNREACHABLE();
     }
     return CONTINUE;
 }
@@ -815,99 +1090,619 @@ static void lex_destroy(struct lex *const self)
     free(self->tokbuf);
 }
 
+static enum args_count get_args_count_for_mnemonic(const enum mnemonic m)
+{
+    assert(m < MNEMONICS_COUNT);
+    return g_mnemmonics[m].args_count;
+}
+
+static const char *mnemonic_to_string(const enum mnemonic m)
+{
+    assert(m < MNEMONICS_COUNT);
+    return g_mnemmonics[m].str;
+}
+
+static const char *opsize_to_string(const enum opsize s)
+{
+    switch (s) {
+    case OPSIZE_NONE: return "none";
+    case OPSIZE_S: return "short";
+    case OPSIZE_B: return "byte";
+    case OPSIZE_W: return "word";
+    case OPSIZE_L: return "long";
+    }
+    UNREACHABLE();
+    return "_unknown";
+}
+
+static enum mnemonic get_mnemonic_from_identifier(
+        const char *const str, const size_t str_length)
+{
+    if (str_length > 7) {
+        return MN_NONE;
+    }
+    char mnemonic_str[8] = {0};
+    for (size_t i = 0; i < str_length; i++) {
+        mnemonic_str[i] = tolower(str[i]);
+    }
+    // Start from 1 since - is dummy NONE
+    for (size_t i = 1; i < MNEMONICS_COUNT; i++) {
+        if (0 == strcmp(mnemonic_str, g_mnemmonics[i].str)) {
+            return (enum mnemonic)i;
+        }
+    }
+    return MN_NONE;
+}
+
+static const char *arg_type_to_string(const enum arg_type type)
+{
+    switch (type) {
+    case ARG_NONE: return "NONE";
+    case ARG_DN: return "Dn";
+    case ARG_AN: return "An";
+    case ARG_AN_ADDR: return "(An)";
+    case ARG_AN_ADDR_INCR: return "(An)+";
+    case ARG_AN_ADDR_DECR: return "-(An)";
+    case ARG_AN_ADDR_16: return "(d16,An)";
+    case ARG_AN_ADDR_8_XI: return "(d8,An,Xi)";
+    case ARG_ADDR_WORD: return "(xxx).w";
+    case ARG_ADDR_LONG: return "(xxx).l";
+    case ARG_ADDR_UNSPEC: return "(xxx).?";
+    case ARG_PC_ADDR_16: return "(d16,PC)";
+    case ARG_PC_ADDR_8_XN: return "(d8,PC,Xn)";
+    case ARG_IMMEDIATE: return "#imm";
+    case ARG_SR: return "SR";
+    case ARG_CCR: return "CCR";
+    case ARG_USP: return "USP";
+    case ARG_EXPR: return "EXPR";
+    }
+    UNREACHABLE();
+    return "_UNKNOWN";
+}
+
 static int pars_init(struct pars *const self, const struct lex *const lex)
 {
     *self = (struct pars){
         .lex = lex,
         .stmttab_stream = open_memstream(
                 (char **)&self->stmttab, &self->stmttab_size),
+        .symtab_stream = open_memstream(
+                (char **)&self->symtab, &self->symtab_size),
         .symbuf_stream = open_memstream(&self->symbuf, &self->symbuf_size),
     };
     assert(self->stmttab_stream != NULL);
+    assert(self->symtab_stream != NULL);
     assert(self->symbuf_stream != NULL);
     return OK;
 }
 
-static int pars_yield_error(struct pars *const self)
+static bool pars_is_eof_reached(const struct pars *const self)
+{
+    const size_t tokens_count = self->lex->tokbuf_size /
+        (sizeof *self->lex->tokbuf);
+    return self->cur_tok_id >= tokens_count;
+}
+
+static const char *stmt_type_to_string(const enum stmt_type type)
+{
+    switch (type) {
+    case ST_NONE: return "NONE";
+    case ST_LABEL: return "LABEL";
+    case ST_INSTRUCTION: return "INSTRUCTION";
+    case ST_ASSIGNMENT: return "ASSIGNMENT";
+    case ST_COMMENT: return "COMMENT";
+    case ST_DIRECTIVE: return "DIRECTIVE";
+    }
+    return "_UNKNOWN";
+}
+
+static void fprint_arg(
+        const struct lex *const lex,
+        const struct arg *const arg,
+        FILE *const s)
+{
+    fprintf(s, "(%s", arg_type_to_string(arg->arg_type));
+    switch (arg->arg_type) {
+    case ARG_NONE:
+    case ARG_DN:
+    case ARG_AN:
+    case ARG_AN_ADDR:
+    case ARG_AN_ADDR_INCR:
+    case ARG_AN_ADDR_DECR:
+        fprintf(s, " reg %d", arg->arg_contents.xn);
+        break;
+    case ARG_AN_ADDR_16:
+        fprintf(s, " reg %d", arg->arg_contents.arg_16.an);
+        fprintf(s, " d16 %d", arg->arg_contents.arg_16.d);
+        break;
+    case ARG_AN_ADDR_8_XI:
+        fprintf(s, " reg %d", arg->arg_contents.arg_8.an);
+        fprintf(s, " d8 %d", arg->arg_contents.arg_8.d);
+        fprintf(s, " xi %d", arg->arg_contents.arg_8.xi);
+        break;
+    case ARG_ADDR_WORD:
+    case ARG_ADDR_LONG:
+    case ARG_ADDR_UNSPEC:
+        fprintf(s, " addr %d", arg->arg_contents.addr);
+        break;
+    case ARG_PC_ADDR_16:
+        fprintf(s, " d16 %d", arg->arg_contents.arg_16.d);
+        break;
+    case ARG_PC_ADDR_8_XN:
+        fprintf(s, " d8 %d", arg->arg_contents.arg_8.d);
+        fprintf(s, " xn %d", arg->arg_contents.arg_8.xi);
+        break;
+    case ARG_IMMEDIATE:
+        fprintf(s, " value %d", arg->arg_contents.imm);
+        break;
+    case ARG_SR:
+    case ARG_CCR:
+    case ARG_USP:
+    case ARG_EXPR:
+        break;
+    }
+    fprintf(s, " raw \"");
+    for (size_t i = 0; i < arg->num_tokens; i++) {
+        const struct token token = lex->tokbuf[arg->first_token + i];
+        if (token.type == TT_NEWLINE) {
+            break;
+        }
+        fprintf(s, "%.*s ", (int)token.length, lex->input + token.offset);
+    }
+    fprintf(s, "\")");
+}
+
+static int fprint_stmt(
+        const struct lex *const lex,
+        struct stmt *const stmt,
+        FILE *const s)
+{
+    assert(stmt);
+    fprintf(s, "(%s", stmt_type_to_string(stmt->type));
+    if (stmt->label_token) {
+        const struct token label = lex->tokbuf[stmt->label_token];
+        fprintf(s, "\n\t(label \"%.*s\")", (int)label.length, lex->input + label.offset);
+    }
+    if (stmt->type == ST_INSTRUCTION) {
+        fprintf(s, "\n\t(mnemonic \"%s\")", mnemonic_to_string(stmt->instruction.mnemonic));
+        fprintf(s, "\n\t(size %s)", opsize_to_string(stmt->instruction.opsize));
+        if (stmt->instruction.arg1.arg_type != ARG_NONE) {
+            fprintf(s, "\n\t(arg1 ");
+            fprint_arg(lex, &stmt->instruction.arg1, s);
+            fprintf(s, ")");
+        }
+        if (stmt->instruction.arg2.arg_type != ARG_NONE) {
+            assert(stmt->instruction.arg1.arg_type != ARG_NONE);
+            fprintf(s, "\n\t(arg2 ");
+            fprint_arg(lex, &stmt->instruction.arg2, s);
+            fprintf(s, ")");
+        }
+    }
+    if (stmt->comment_token) {
+        const struct token comment = lex->tokbuf[stmt->comment_token];
+        fprintf(s, "\n\t(comment \"%.*s\")", (int)comment.length, lex->input + comment.offset);
+    }
+    fprintf(s, "\n\t(raw \"");
+    for (size_t i = 0; i < stmt->num_tokens; i++) {
+        const struct token token = lex->tokbuf[stmt->first_token + i];
+        if (token.type == TT_NEWLINE) {
+            break;
+        }
+        fprintf(s, "%.*s ", (int)token.length, lex->input + token.offset);
+    }
+    fprintf(s, "\"))\n");
+    return 0;
+}
+
+static int fwrite_stmt(const struct stmt *const stmt, FILE *const stream)
+{
+    const int res = fwrite(stmt, sizeof *stmt, 1, stream);
+    assert(res == 1);
+    return res;
+}
+
+static size_t find_line_length(const char *const str)
+{
+    for (size_t i = 0;; i++) {
+        const char c = str[i];
+        if (c == '\n' || c == '\r' || c == '\000') {
+            return i;
+        }
+    }
+    return 0;
+}
+
+static int pars_yield_error_str(
+        struct pars *const self, const struct line_pos_info l, char *const str)
+{
+    fprintf(
+            stderr,
+            "<stdin>:%lu:%lu: parsing error: expected %s, found '%s'\n",
+            l.line_num + 1,
+            l.column_num + 1,
+            "<TODO>",
+            str);
+    free(str);
+    const size_t line_length = find_line_length(self->lex->input + l.line_offset);
+    char *const line = calloc(1, line_length + 1);
+    memcpy(line, self->lex->input + l.line_offset, line_length);
+    fprintf(stderr, "%5lu | %s\n", l.line_num + 1, line);
+    free(line);
+    fputs("      | ", stderr);
+    for (size_t i = 0; i < l.column_num; i++) {
+        if (self->lex->input[l.line_offset + i] == '\t') {
+            fputc('\t', stderr);
+        } else {
+            fputc(' ', stderr);
+        }
+    }
+    fputs("^\n", stderr);
+    return ERR;
+}
+
+static int pars_yield_error(struct pars *const self, const size_t token_id)
+{
+    const struct token token = self->lex->tokbuf[token_id];
+    const struct line_pos_info l =
+        lex_get_line_pos_info(self->lex, token.offset);
+    char *const found = calloc(1, token.length + 10);
+    snprintf(found, token.length + 1, "%s", self->lex->input + token.offset);
+    return pars_yield_error_str(self, l, found);
+}
+
+static int pars_yield_error_nesting(
+        struct pars *const self,
+        const size_t expression_start_token_id,
+        const size_t expression_length_tokens)
 {
     (void) self;
-    // TODO
+    (void) expression_start_token_id;
+    (void) expression_length_tokens;
     return ERR;
 }
 
-static int pars_parse_label(struct pars *const self)
+static int pars_yield_error_eof(struct pars *const self)
 {
-    const struct token label = self->lex->tokbuf[self->cur_tok_id++];
-    if (label.type != TT_IDENTIFIER) {
-        return pars_yield_error(self);
-    }
-    // TODO
-    return OK;
+    (void) self;
+    return ERR;
 }
 
-static int pars_parse_instr(struct pars *const self, const struct token mnemo)
+static int pars_parse_direc(
+        struct pars *const self, const struct token *const dot)
 {
     (void) self;
-    (void) mnemo;
-    // TODO
+    (void) dot;
+    return pars_yield_error(self, self->cur_tok_id);
+}
+
+enum opsize get_opsize_from_specifier(const char size_specifier)
+{
+    switch (tolower(size_specifier)) {
+    case 's': return OPSIZE_S;
+    case 'b': return OPSIZE_B;
+    case 'w': return OPSIZE_W;
+    case 'l': return OPSIZE_L;
+    }
+    return OPSIZE_NONE;
+}
+
+static bool is_expression_token(const enum token_type type)
+{
+    switch (type) {
+    case TT_PLUS: return true;
+    case TT_MINUS: return true;
+    case TT_ASTERISK: return true;
+    case TT_SLASH: return true;
+    case TT_PERCENT: return true;
+    case TT_LEFT_SHIFT: return true;
+    case TT_RIGHT_SHIFT: return true;
+    case TT_HASH: return true;
+    case TT_BANG: return true;
+    case TT_TILDE: return true;
+    case TT_AMPERSAND: return true;
+    case TT_PIPE: return true;
+    case TT_CAP: return true;
+    case TT_IDENTIFIER: return true;
+    case TT_NUMDEC: return true;
+    case TT_NUMOCT: return true;
+    case TT_NUMHEX: return true;
+    case TT_LPAREN: return true;
+    case TT_RPAREN: return true;
+    default: return false;
+    }
+    return false;
+}
+
+static int pars_parse_arg(
+        struct pars *const self, struct arg *const arg)
+{
+    const size_t tokens_count = self->lex->tokbuf_size /
+        (sizeof *self->lex->tokbuf);
+    const size_t first_token_id = self->cur_tok_id;
+    int nesting = 0;
+    enum arg_type arg_type = ARG_EXPR;
+    while (self->cur_tok_id < tokens_count) {
+        const size_t token_id = self->cur_tok_id; // Peek
+        const struct token token = self->lex->tokbuf[token_id];
+        if (token.type == TT_LPAREN) {
+            nesting++;
+        } else if (token.type == TT_RPAREN) {
+            nesting--;
+        } else if (is_expression_token(token.type)) {
+            // TODO parse expression
+        } else if (nesting > 0 && token.type == TT_COMMA) {
+            // Comma inside parentheses is allowed
+        } else {
+            break;
+        }
+        self->cur_tok_id++; // Commit
+    }
+    if (nesting != 0) {
+        return pars_yield_error_nesting(
+                self, first_token_id, self->cur_tok_id - first_token_id);
+    }
+    if (first_token_id == self->cur_tok_id) {
+        // Nothing has been parsed
+        *arg = (struct arg){0};
+    } else {
+        *arg = (struct arg){
+            .arg_type = arg_type,
+            // TODO arg_contents
+            .first_token = first_token_id,
+            .num_tokens = self->cur_tok_id - first_token_id,
+        };
+    }
     return OK;
 }
 
-static int pars_parse_direc(struct pars *const self)
+static int pars_yield_instruction(
+        struct pars *const self,
+        const size_t label_id,
+        const size_t comment_id,
+        const size_t mnemonic_id,
+        const enum opsize opsize,
+        const struct arg *const arg1,
+        const struct arg *const arg2)
 {
-    const char *input = self->lex->input;
-    const struct token direc = self->lex->tokbuf[self->cur_tok_id++];
-    if (direc.type != TT_IDENTIFIER) {
-        return pars_yield_error(self);
+    const struct token mnemonic_token = self->lex->tokbuf[mnemonic_id];
+    const enum mnemonic mnemonic = get_mnemonic_from_identifier(
+            self->lex->input + mnemonic_token.offset, mnemonic_token.length);
+    if (mnemonic == MN_NONE) {
+        return pars_yield_error(self, mnemonic_id);
+    }
+    if (arg2) {
+        assert(arg1);
     }
-    if (0 == strcmp(input + direc.offset, "def")) {
-    } else if (0 == strcmp(input + direc.offset, "opt")) {
-    } else if (0 == strcmp(input + direc.offset, "file")) {
-    } else if (0 == strcmp(input + direc.offset, "text")) {
-    } else if (0 == strcmp(input + direc.offset, "align")) {
-    } else if (0 == strcmp(input + direc.offset, "globl")) {
-    } else if (0 == strcmp(input + direc.offset, "ln")) {
-    } else if (0 == strcmp(input + direc.offset, "long")) {
-    } else if (0 == strcmp(input + direc.offset, "word")) {
-    } else if (0 == strcmp(input + direc.offset, "byte")) {
-    } else if (0 == strcmp(input + direc.offset, "bin")) {
+    const enum args_count args_count = get_args_count_for_mnemonic(mnemonic);
+    // Validate instruction arguments count
+    switch (args_count) {
+    case ARGS_COUNT_UNKNOWN:
+        UNREACHABLE();
+        break;
+    case ARGS_COUNT_0:
+        if (arg1) {
+            return pars_yield_error(self, arg1->first_token);
+        }
+        break;
+    case ARGS_COUNT_0_1:
+        if (arg2) {
+            return pars_yield_error(self, arg2->first_token);
+        }
+        break;
+    case ARGS_COUNT_0_1_2:
+        break;
+    case ARGS_COUNT_0_2:
+        if (arg1 && !arg2) {
+            return pars_yield_error(self, mnemonic_id);
+        }
+        break;
+    case ARGS_COUNT_1:
+        if (!arg1) {
+            return pars_yield_error(self, mnemonic_id);
+        } else if (arg2) {
+            return pars_yield_error(self, arg2->first_token);
+        }
+        break;
+    case ARGS_COUNT_1_2:
+        if (!arg1) {
+            return pars_yield_error(self, mnemonic_id);
+        }
+        break;
+    case ARGS_COUNT_2:
+        if (!arg1 || !arg2) {
+            return pars_yield_error(self, mnemonic_id);
+        }
+        break;
     }
-    // TODO
+    const size_t first_token_id = label_id ? label_id : mnemonic_id;
+    const struct stmt stmt = {
+        .type = ST_INSTRUCTION,
+        .instruction = {
+            .mnemonic = mnemonic,
+            .opsize = opsize,
+            .arg1 = arg1 ? *arg1 : (struct arg){0},
+            .arg2 = arg2 ? *arg2 : (struct arg){0},
+        },
+        .label_token = label_id,
+        .comment_token = comment_id,
+        .first_token = first_token_id,
+        .num_tokens = self->cur_tok_id - first_token_id,
+    };
+    fwrite_stmt(&stmt, self->stmttab_stream);
     return OK;
 }
 
-static int pars_parse_instr_or_direc(struct pars *const self)
+static int pars_parse_instruction_comment(
+        struct pars *const self,
+        const size_t label_id,
+        const size_t mnemonic_id,
+        const enum opsize opsize,
+        const struct arg *const arg1,
+        const struct arg *const arg2)
 {
-    struct token token;
-    if (token.type == TT_DOT) {
-        return pars_parse_direc(self);
-    } else if (token.type == TT_IDENTIFIER) {
-        return pars_parse_instr(self, token);
-    } else {
-        return pars_yield_error(self);
+    size_t comment_id = 0;
+    if (!pars_is_eof_reached(self)) {
+        // Try parse comment
+        const size_t token1_id = self->cur_tok_id; // Peek comment
+        const struct token token1 = self->lex->tokbuf[token1_id];
+        const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
+            token1.type == TT_COMMENT_SEMICOLON;
+        if (is_comment) {
+            self->cur_tok_id++; // Commit comment
+            comment_id = token1_id;
+        }
+        if (!pars_is_eof_reached(self)) {
+            // Handle new line
+            const size_t nl_id = self->cur_tok_id++; // Commit new line
+            const struct token nl = self->lex->tokbuf[nl_id];
+            if (nl.type != TT_NEWLINE) {
+                return pars_yield_error(self, nl_id);
+            }
+        }
+    }
+    return pars_yield_instruction(
+            self, label_id, comment_id, mnemonic_id, opsize, arg1, arg2);
+}
+
+static int pars_parse_instruction_args(
+        struct pars *const self,
+        const size_t label_id,
+        const size_t mnemonic_id,
+        const enum opsize opsize)
+{
+    struct arg arg1, arg2;
+    // Try parse first argument
+    const int res1 = pars_parse_arg(self, &arg1);
+    if (res1 != OK) {
+        return res1;
+    }
+    if (arg1.arg_type == ARG_NONE) {
+        return pars_parse_instruction_comment(
+                self, label_id, mnemonic_id, opsize, NULL, NULL);
+    }
+    if (pars_is_eof_reached(self)) {
+        return pars_yield_instruction(
+                self, label_id, 0, mnemonic_id, opsize, &arg1, NULL);
+    }
+    const size_t comma_id = self->cur_tok_id; // Peek comma
+    const struct token comma = self->lex->tokbuf[comma_id];
+    if (comma.type != TT_COMMA) {
+        return pars_parse_instruction_comment(
+                self, label_id, mnemonic_id, opsize, NULL, NULL);
+    }
+    self->cur_tok_id++; // Commit comma
+    // Try parse second argument
+    const int res2 = pars_parse_arg(self, &arg2);
+    if (res2 != OK) {
+        return res2;
+    }
+    if (pars_is_eof_reached(self)) {
+        return pars_yield_instruction(
+                self, label_id, 0, mnemonic_id, opsize, &arg1, &arg2);
+    }
+    // Finish parsing instruction, expect comment or newline
+    return pars_parse_instruction_comment(
+            self, label_id, mnemonic_id, opsize, &arg1, &arg2);
+}
+
+static int pars_parse_instruction(
+        struct pars *const self,
+        const size_t label_id,
+        const size_t mnemonic_id)
+{
+    if (pars_is_eof_reached(self)) {
+        return pars_yield_error_eof(self);
+    }
+    const size_t token2_id = self->cur_tok_id; // Peek
+    const struct token token2 = self->lex->tokbuf[token2_id];
+    if (token2.type == TT_DOT) {
+        self->cur_tok_id++; // Commit
+        if (pars_is_eof_reached(self)) {
+            return pars_yield_error_eof(self);
+        }
+        const size_t size_spec_id = self->cur_tok_id++;
+        const struct token size_spec = self->lex->tokbuf[size_spec_id];
+        if (size_spec.type != TT_IDENTIFIER) {
+            return pars_yield_error(self, size_spec_id);
+        }
+        // Size specifier
+        if (size_spec.length != 1) {
+            return pars_yield_error(self, size_spec_id);
+        }
+        const size_t opsize =
+            get_opsize_from_specifier(self->lex->input[size_spec.offset]);
+        if (opsize == OPSIZE_NONE) {
+            return pars_yield_error(self, size_spec_id);
+        }
+        if (pars_is_eof_reached(self)) {
+            return pars_yield_error_eof(self);
+        }
+        return pars_parse_instruction_args(self, label_id, mnemonic_id, opsize);
+    }
+    return pars_parse_instruction_args(
+            self, label_id, mnemonic_id, OPSIZE_NONE);
+}
+
+static int pars_parse_assignment(
+        struct pars *const self, const size_t label_id, const size_t symbol_id)
+{
+    (void) label_id;
+    (void) symbol_id;
+    return pars_yield_error(self, self->cur_tok_id);
+}
+
+static int pars_yield_label_comment(
+        struct pars *const self, const size_t label_id, const size_t comment_id)
+{
+    if (label_id || comment_id) {
+        const size_t first_token = label_id ? label_id : comment_id;
+        struct stmt stmt = {
+            .type = label_id ? ST_LABEL : ST_COMMENT,
+            .label_token = label_id,
+            .comment_token = comment_id,
+            .first_token = first_token,
+            .num_tokens = self->cur_tok_id - first_token,
+        };
+        fwrite_stmt(&stmt, self->stmttab_stream);
     }
     return OK;
 }
 
-static int pars_parse_statement(struct pars *const self)
+static int pars_parse_labeled_statement(
+        struct pars *const self, const size_t label_id)
 {
-    const size_t tokens_count = self->lex->tokbuf_size /
-        (sizeof *self->lex->tokbuf);
-    const struct token token = self->lex->tokbuf[self->cur_tok_id++];
-    const bool is_comment = token.type == TT_COMMENT_ASTERISK ||
-        token.type == TT_COMMENT_SEMICOLON;
-    if (token.type == TT_IDENTIFIER) {
-        return pars_parse_label(self);
-    } else if (!is_comment) {
-        if (self->cur_tok_id < tokens_count) {
-            const struct token nl = self->lex->tokbuf[self->cur_tok_id++];
-            assert(nl.type == TT_NEWLINE);
+    const size_t token1_id = self->cur_tok_id++;
+    const struct token token1 = self->lex->tokbuf[token1_id];
+    const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
+        token1.type == TT_COMMENT_SEMICOLON;
+    if (is_comment) {
+        return pars_yield_label_comment(self, label_id, token1_id);
+    } else if (token1.type == TT_NEWLINE) {
+        return pars_yield_label_comment(self, label_id, 0);
+    } else if (token1.type == TT_IDENTIFIER) {
+        if (pars_is_eof_reached(self)) {
+            return pars_yield_error_eof(self);
         }
-        return OK;
-    } else if (token.type == TT_NEWLINE) {
-        return OK;
+        const size_t token2_id = self->cur_tok_id; // Peek
+        const struct token token2 = self->lex->tokbuf[token2_id];
+        if (!label_id && token2.type == TT_COLON) {
+            self->cur_tok_id++; // Commit
+            return pars_parse_labeled_statement(self, token1_id);
+        } else if (token2.type == TT_EQ || token2.type == TT_EQ_DOUBLE) {
+            self->cur_tok_id++; // Commit
+            return pars_parse_assignment(self, label_id, token2_id);
+        } else {
+            return pars_parse_instruction(self, label_id, token1_id);
+        }
+    } else if (token1.type == TT_DOT) {
+        return pars_parse_direc(self, &token1);
     }
-    return pars_yield_error(self);
+    return pars_yield_error(self, token1_id);
+}
+
+static int pars_parse_statement(struct pars *const self)
+{
+    return pars_parse_labeled_statement(self, 0);
 }
 
 /** Run parser until the end of the input reached
@@ -916,22 +1711,31 @@
  */
 static int pars_run(struct pars *const self)
 {
-    return OK;
     const size_t tokens_count = self->lex->tokbuf_size /
         (sizeof *self->lex->tokbuf);
-    do {
-        const int ret = pars_parse_statement(self);
+    // Skip dummy token at position 0
+    self->cur_tok_id = 1;
+    // Leave dummy statement at position 0
+    fwrite_stmt(&(struct stmt){0}, self->stmttab_stream);
+    int ret = OK;
+    while (self->cur_tok_id < tokens_count) {
+        ret = pars_parse_statement(self);
         if (ret != OK) {
-            return ret;
+            break;
         }
-    } while (self->cur_tok_id < tokens_count);
-    return OK;
+    }
+    fflush(self->stmttab_stream);
+    fflush(self->symtab_stream);
+    fflush(self->symbuf_stream);
+    return ret;
 }
 
static void pars_destroy(struct pars *const self)
{
     fclose(self->stmttab_stream);
     free(self->stmttab);
+    fclose(self->symtab_stream);
+    free(self->symtab);
     fclose(self->symbuf_stream);
     free(self->symbuf);
 }
@@ -952,12 +1756,18 @@ static int assem_resolve(struct assem *const self)
 
 static int assem_emit(struct assem *const self, FILE *const stream)
 {
-    if (TRACE_LEX) {
-        const struct lex *const lex = self->pars->lex;
-        for (size_t i = 0; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {
+    const struct lex *const lex = self->pars->lex;
+    const struct pars *const pars = self->pars;
+    if (TRACE_LEXER) {
+        for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {
             fprint_tok(lex->input, &lex->tokbuf[i], stream);
         }
     }
+    if (TRACE_PARSER) {
+        for (size_t i = 1; i < pars->stmttab_size / (sizeof *pars->stmttab); i++) {
+            fprint_stmt(lex, pars->stmttab + i, stream);
+        }
+    }
     return OK;
 }
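Note on the token/statement tables: the patch relies on a convention that is only stated in a code comment inside lex_init() -- both tokbuf and stmttab are open_memstream-backed arrays, and slot 0 of each holds a dummy entry (written in lex_init() and pars_run()) so that an index of zero can mean "no token" / "no statement". A minimal standalone sketch of that pattern follows; the demo_* names are illustrative only and are not part of main.c.

/* Sketch (not project code): an open_memstream-backed table with a sentinel
 * element at index 0, so that index 0 can later mean "absent". */
#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_token { int type; size_t offset, length; };

int main(void)
{
    struct demo_token *buf = NULL;
    size_t buf_size = 0;
    FILE *stream = open_memstream((char **)&buf, &buf_size);
    assert(stream != NULL);
    // Index 0 is a sentinel; real entries start at index 1.
    const struct demo_token sentinel = {0};
    fwrite(&sentinel, sizeof sentinel, 1, stream);
    const struct demo_token tok = { 1, 0, 4 };
    fwrite(&tok, sizeof tok, 1, stream);
    fflush(stream); // Make buf and buf_size observable before reading.
    printf("entries (incl. sentinel): %zu, first real type: %d\n",
           buf_size / sizeof *buf, buf[1].type);
    fclose(stream);
    free(buf);
    return 0;
}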

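The Makefile change and the new UNREACHABLE() macro appear to be paired: asserts on impossible switch arms become __builtin_unreachable(), and -fsanitize=unreachable (a UBSan check in GCC and Clang) makes a reached __builtin_unreachable() produce a runtime diagnostic instead of silent undefined behavior. A small illustration of the pattern follows, using hypothetical demo_* names rather than code from main.c.

#include <stdio.h>

#if defined(__GNUC__) || defined(__clang__)
#define UNREACHABLE __builtin_unreachable
#else
#define UNREACHABLE()
#endif

enum demo_size { DEMO_B, DEMO_W, DEMO_L };

static const char *demo_size_str(enum demo_size s)
{
    switch (s) {
    case DEMO_B: return "byte";
    case DEMO_W: return "word";
    case DEMO_L: return "long";
    }
    UNREACHABLE();   // Reached only if `s` holds a value outside the enum.
    return "_unknown";
}

int main(void)
{
    printf("%s\n", demo_size_str(DEMO_W));
    return 0;
}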