/* SPDX-License-Identifier: Unlicense
 *
 * This program translates the Sierra m68k assembly dialect to the GNU AS m68k dialect.
 *
 * NOTE: Unicode is not supported, ASCII only.
 */

#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef TRACE_LEXER
#define TRACE_LEXER 0
#endif

#ifndef TRACE_PARSER
#define TRACE_PARSER 0
#endif

#if defined(__GNUC__) || defined(__clang__)
#if !defined(NDEBUG)
#define UNREACHABLE() assert(false)
#else
#define UNREACHABLE __builtin_unreachable
#endif
#else
#define UNREACHABLE()
#endif

#define E_NIMPL "not implemented"
#define E_UNREACH "unreachable code reached"
#define E_EXPR "'(', ')', unary operator, binary operator, number or identifier"
#define E_EXPR_NONREG "symbol that is not a register when parsing expression"
#define E_EXPR_OPEN_WITH_ID "'-', '(', number or identifier"
#define E_EXPR_CLOSE_WITH_ID "')', '+', '-', '*', '/' or identifier"
#define E_EXPR_OPEN "'-', '(' or number"
#define E_EXPR_CLOSE "')', '+', '-', '*' or '/'"
#define E_DN "D0, ...D7"
#define E_AN "A0, ...A7, SP"
#define E_AN_DN E_AN " or " E_DN
#define E_REGMASK_DELIM "'/' or '-'"
#define E_REGMASK_TOKEN "'/', '-', " E_AN_DN
#define E_REGMASK_ASCEND "registers in register mask range must be specified " \
    "in ascending order"
#define E_EA_PART "D0, ...D7, A0, ...A7, SP, PC or full expression"
#define E_EA_PART_NOT_AN "D0, ...D7, PC or full expression"
#define E_EA_PART_NOT_EXPR E_AN ", " E_DN ", or PC"
#define E_EA_PART_DELIM "',' or ')'"
#define E_EA_INVALID "invalid addressing mode"
#define E_ARG "valid instruction argument"
#define E_MNEMONIC "valid instruction mnemonic"
#define E_INSN_SIZE_SPEC "'.s', '.b', '.w' or '.l'"
#define E_ADDR_SIZE_SPEC "'.b', '.w' or '.l'"
#define E_ADDR_INDIR_SIZE_SPEC "'.w' or '.l'"
#define E_ADDR_INDIR_MULTIPLE_INDEX_REGS "multiple index registers specified"
#define E_ARGS_COUNT "invalid arguments count"
#define E_NL "new line, which is '\\n', '\\r\\n' or '\\r'"
#define E_COMMENT_NL "';' or " E_NL
#define E_INSTR_END "',', " E_COMMENT_NL
#define E_LABELED_STMT "':', '=', '==' or " E_MNEMONIC
#define E_DIRECTIVE "directive"
#define E_STMT_BEGIN "label, " E_MNEMONIC ", " E_DIRECTIVE " or " E_NL
#define E_UNKNOWN_DRC "unknown directive"
#define E_NUM "number"
#define E_STR "string"
#define E_ID "identifier"
#define E_ID_NUM_DOT E_ID ", " E_NUM " or '.'"
#define E_NESTED_DEF "nested .def 
.endef blocks are illegal" #define E_NMATCH_ENDEF ".endef directive without matching .def" #define E_MULTIPLE_VAL "multiple .val directives specified" #define E_MULTIPLE_SCL "multiple .scl directives specified" #define E_MULTIPLE_TYPE "multiple .type directives specified" #define E_MAX_NESTING "maximum expression nesting level reached" #define ERR 0 #define OK 1 #define CONTINUE 2 #define BCC_S_MAX_BACKWARDS 126 #define EXPR_NESTING_MAX 10 #define PARS_EXPR_FLAG_ALLOW_ID 1 // These are from Sierra's FILE_FMT.H #define C_EFCN -1 /* physical end of function */ #define DT_FCN 2 /* function */ enum token_type { TT_NONE = 0, TT_NEWLINE, TT_ESCAPE, TT_DOT, TT_COMMA, TT_PLUS, TT_MINUS, TT_ASTERISK, TT_SLASH, TT_EQ, TT_EQ_DOUBLE, TT_COLON, TT_PERCENT, TT_LSHIFT, TT_RSHIFT, TT_HASH, TT_BANG, TT_TILDE, TT_AMPERSAND, TT_PIPE, TT_CAP, TT_STRING, TT_ID, TT_DOT_ID, TT_NUMDEC, TT_NUMOCT, TT_NUMHEX, TT_LPAREN, TT_RPAREN, TT_LBRACKET, TT_RBRACKET, TT_RBRACE, TT_LBRACE, TT_COMMENT_ASTERISK, TT_COMMENT_SEMICOLON, }; struct token { enum token_type type; uint32_t value; ///< For TT_NUMOCT, TT_NUMDEC, TT_NUMHEX size_t offset; size_t length; }; enum lex_error { LE_NONE = 0, LE_SOME, }; enum lex_state { LS_FREE = 0, LS_CR, LS_LSHIFT, LS_RSHIFT, LS_EQ, LS_DOT, LS_DOT_ID, LS_ID, LS_NUMOCTHEX, LS_NUMOCT, LS_NUMHEX, LS_NUMDEC, LS_STRING, LS_STRING_ESC, LS_COMMENT_ASTERISK, LS_COMMENT_SEMICOLON, LS_ERROR, LS_EOF, }; struct line_pos_info { unsigned long line_num; unsigned long column_num; unsigned long line_offset; }; struct lex { // State variables enum lex_state state; enum lex_error error; uint32_t current_number_value; size_t cursor; size_t tok_offset; bool inside_line; // Input data buffer FILE *input_stream; char *input; size_t input_size; // Tokens table FILE *tokbuf_stream; struct token *tokbuf; size_t tokbuf_size; size_t tokens_count; }; enum stmt_type { ST_NONE = 0, ST_LABEL, ST_INSTRUCTION, ST_ASSIGNMENT, ST_COMMENT, ST_DIRECTIVE, ST_META_SAT, ///< Not a real statement, just an accumulation of .def .endef block }; enum mnemonic { MN_NONE = 0, MN_ABCD, MN_ADD, MN_ADDA, MN_ADDI, MN_ADDQ, MN_ADDX, MN_AND, MN_ANDI, MN_ASL, MN_ASR, MN_BRA, MN_BSR, MN_BCC, MN_BCS, MN_BEQ, MN_BGE, MN_BGT, MN_BHI, MN_BLE, MN_BLS, MN_BLT, MN_BMI, MN_BNE, MN_BPL, MN_BVC, MN_BVS, MN_BCHG, MN_BCLR, MN_BSET, MN_BTST, MN_CHK, MN_CLR, MN_CMP, MN_CMPA, MN_CMPI, MN_CMPM, MN_DBT, MN_DBF, MN_DBCC, MN_DBCS, MN_DBEQ, MN_DBGE, MN_DBGT, MN_DBHI, MN_DBLE, MN_DBLS, MN_DBLT, MN_DBMI, MN_DBNE, MN_DBPL, MN_DBVC, MN_DBVS, MN_DIVU, MN_DIVS, MN_EOR, MN_EORI, MN_EXG, MN_EXT, MN_ILLEGAL, MN_JMP, MN_JSR, MN_LEA, MN_LINK, MN_LSL, MN_LSR, MN_MOVE, MN_MOVEA, MN_MOVEM, MN_MOVEP, MN_MOVEQ, MN_MULS, MN_MULU, MN_NBCD, MN_NEG, MN_NEGX, MN_NOP, MN_NOT, MN_OR, MN_ORI, MN_PEA, MN_RESET, MN_ROL, MN_ROR, MN_ROXL, MN_ROXR, MN_RTE, MN_RTR, MN_RTS, MN_SBCD, MN_ST, MN_SF, MN_SCC, MN_SCS, MN_SEQ, MN_SGE, MN_SGT, MN_SHI, MN_SLE, MN_SLS, MN_SLT, MN_SMT, MN_SNE, MN_SPL, MN_SVC, MN_SVS, MN_STOP, MN_SUB, MN_SUBA, MN_SUBI, MN_SUBQ, MN_SUBX, MN_SWAP, MN_TAS, MN_TRAP, MN_TRAPV, MN_TST, MN_UNLK, MNEMONICS_COUNT, }; enum directive_type { DT_NONE = 0, DT_ALIGN, DT_ASCII, DT_BIN, DT_BSECTION, DT_BSS, DT_BYTE, DT_CMNT, DT_COMM, DT_DATA, DT_DEF, DT_DIM, DT_DOUBLE, DT_DSECTION, DT_ECHO, DT_ELIFDEF, DT_ELSE, DT_END, DT_ENDC, DT_ENDEF, DT_ENDIF, DT_ENDS, DT_EXTEND, DT_EXTERN, DT_FILE, DT_FILL, DT_FLOAT, DT_FPDATA, DT_GLOBL, DT_IFDEF, DT_IFNDEF, DT_INCLUDE, DT_LCOMM, DT_LINE, DT_LN, DT_LONG, DT_OPT, DT_ORG, DT_PACKED, DT_PAGE, DT_REORG, DT_SCL, DT_SECTION, DT_SHORT, DT_SINGLE, DT_SIZE, 
DT_SPACE, DT_STRUCT, DT_TAG, DT_TEXT, DT_TSECTION, DT_TYPE, DT_VAL, DT_WORD, DT_XDEF, DT_XREF, DIRECTIVES_COUNT, }; enum opsize { OPSIZE_NONE = 0, OPSIZE_S, OPSIZE_B, OPSIZE_W, OPSIZE_L, }; enum arg_type { ARG_NONE = 0, ARG_DN, ARG_AN, ARG_AN_ADDR, ARG_AN_ADDR_INCR, ARG_AN_ADDR_DECR, ARG_AN_ADDR_16, ARG_AN_ADDR_8_XI, ARG_ADDR_WORD, ARG_ADDR_LONG, ARG_ADDR_UNSPEC, ARG_PC_ADDR_16, ARG_PC_ADDR_8_XI, ARG_IMMEDIATE, ARG_REGMASK, ARG_SR, ARG_CCR, ARG_USP, }; enum args_count { ARGS_COUNT_UNKNOWN = 0, ARGS_COUNT_0, ARGS_COUNT_1, ARGS_COUNT_1_2, ARGS_COUNT_2, }; struct expr { size_t first_token, num_tokens; int32_t value; bool value_is_resolved; }; struct arg { enum arg_type type; uint16_t regmask; ///< For regmask (movem only) uint8_t xn; ///< For Dn, An, (An), -(An), (An)+, (d16,An) uint8_t xi; ///< For (d8,An,Xi) and (d8,PC,Xi), it has 0x8 mask set if An enum opsize briefext_size; struct expr expr; size_t first_token, num_tokens; ///< Argument tokens span }; struct instruction { enum mnemonic mnemonic; enum opsize opsize; struct arg arg1, arg2; }; struct directive { enum directive_type type; size_t name_token; ///< Directive self first token size_t first_token, num_tokens; ///< Directive arguments tokens span }; /// Symbol Attribute Table (SAT, a `.def ... .endef` block) struct sat { struct expr def_arg; struct expr val_arg; struct expr scl_arg; struct expr type_arg; }; struct stmt { enum stmt_type type; uint32_t addr; union { struct instruction instruction; struct directive directive; struct sat sat; }; size_t label_token; size_t first_token, num_tokens; // Statement tokens span, may be NULL size_t comment_token; }; enum pars_error { PE_NONE = 0, PE_LEX, PE_SOME, }; enum reg_type { REG_NONE = 0, REG_DN, REG_AN, REG_PC, REG_SR, REG_CCR, REG_USP, }; enum recognized_token_type { RTT_NONE = 0, RTT_REG, // TT_ID RTT_NUMBER, // TT_NUMHEX, TT_NUMOCT and TT_NUMDEC }; struct token_recognition { enum recognized_token_type type; union { struct { enum reg_type reg; uint8_t reg_num; }; ///< For RTT_REG int32_t number; ///< For TT_ID size_t symbol_id; ///< For TT_ID and TT_DOT_ID, see (struct pars).symtab }; }; struct sym { size_t stmt_id; uint32_t addr; }; struct pars { const struct lex *lex; // State size_t cur_tok_id; enum pars_error error; bool in_sat; ///< Indicates whether inside `.def ... 
.endef` block or not struct sat sat; // Statement table FILE *stmttab_stream; struct stmt *stmttab; size_t stmttab_size; // Symbol table FILE *symtab_stream; struct sym *symtab; size_t symtab_size; uint32_t addr_cursor; ///< Current address relative to parsed instructions // Symbol buffer for symbol table FILE *symbuf_stream; char *symbuf; size_t symbuf_size; }; struct assem { const struct pars *pars; struct sat sat; }; static int pars_parse_arg_inside_parens( struct pars *const self, struct arg *const arg); const char *const g_escape_table[256] = { "\\x00", "\\x01", "\\x02", "\\x03", "\\x04", "\\x05", "\\x06", "\\x07", "\\x08", "\\t", "\\n", "\\x0b", "\\x0c", "\\r", "\\x0e", "\\x0f", "\\x10", "\\x11", "\\x12", "\\x13", "\\x14", "\\x15", "\\x16", "\\x17", "\\x18", "\\x19", "\\x1a", "\\x1b", "\\x1c", "\\x1d", "\\x1e", "\\x1f", " ", "!", "\\\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "\\<", "=", "\\>", "?", "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\\\", "]", "^", "_", "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|", "}", "~", "\\x7f", "\\x80", "\\x81", "\\x82", "\\x83", "\\x84", "\\x85", "\\x86", "\\x87", "\\x88", "\\x89", "\\x8a", "\\x8b", "\\x8c", "\\x8d", "\\x8e", "\\x8f", "\\x90", "\\x91", "\\x92", "\\x93", "\\x94", "\\x95", "\\x96", "\\x97", "\\x98", "\\x99", "\\x9a", "\\x9b", "\\x9c", "\\x9d", "\\x9e", "\\x9f", "\\xa0", "\\xa1", "\\xa2", "\\xa3", "\\xa4", "\\xa5", "\\xa6", "\\xa7", "\\xa8", "\\xa9", "\\xaa", "\\xab", "\\xac", "\\xad", "\\xae", "\\xaf", "\\xb0", "\\xb1", "\\xb2", "\\xb3", "\\xb4", "\\xb5", "\\xb6", "\\xb7", "\\xb8", "\\xb9", "\\xba", "\\xbb", "\\xbc", "\\xbd", "\\xbe", "\\xbf", "\\xc0", "\\xc1", "\\xc2", "\\xc3", "\\xc4", "\\xc5", "\\xc6", "\\xc7", "\\xc8", "\\xc9", "\\xca", "\\xcb", "\\xcc", "\\xcd", "\\xce", "\\xcf", "\\xd0", "\\xd1", "\\xd2", "\\xd3", "\\xd4", "\\xd5", "\\xd6", "\\xd7", "\\xd8", "\\xd9", "\\xda", "\\xdb", "\\xdc", "\\xdd", "\\xde", "\\xdf", "\\xe0", "\\xe1", "\\xe2", "\\xe3", "\\xe4", "\\xe5", "\\xe6", "\\xe7", "\\xe8", "\\xe9", "\\xea", "\\xeb", "\\xec", "\\xed", "\\xee", "\\xef", "\\xf0", "\\xf1", "\\xf2", "\\xf3", "\\xf4", "\\xf5", "\\xf6", "\\xf7", "\\xf8", "\\xf9", "\\xfa", "\\xfb", "\\xfc", "\\xfd", "\\xfe", }; const struct mnemonic_meta { const char *str; enum args_count args_count; } g_mnemmonics[MNEMONICS_COUNT] = { { "none", ARGS_COUNT_0 }, { "abcd", ARGS_COUNT_2 }, { "add", ARGS_COUNT_2 }, { "adda", ARGS_COUNT_2 }, { "addi", ARGS_COUNT_2 }, { "addq", ARGS_COUNT_2 }, { "addx", ARGS_COUNT_2 }, { "and", ARGS_COUNT_2 }, { "andi", ARGS_COUNT_2 }, { "asl", ARGS_COUNT_1_2 }, { "asr", ARGS_COUNT_1_2 }, { "bra", ARGS_COUNT_1 }, { "bsr", ARGS_COUNT_1 }, { "bcc", ARGS_COUNT_1 }, { "bcs", ARGS_COUNT_1 }, { "beq", ARGS_COUNT_1 }, { "bge", ARGS_COUNT_1 }, { "bgt", ARGS_COUNT_1 }, { "bhi", ARGS_COUNT_1 }, { "ble", ARGS_COUNT_1 }, { "bls", ARGS_COUNT_1 }, { "blt", ARGS_COUNT_1 }, { "bmi", ARGS_COUNT_1 }, { "bne", ARGS_COUNT_1 }, { "bpl", ARGS_COUNT_1 }, { "bvc", ARGS_COUNT_1 }, { "bvs", ARGS_COUNT_1 }, { "bchg", ARGS_COUNT_2 }, { "bclr", ARGS_COUNT_2 }, { "bset", ARGS_COUNT_2 }, { "btst", ARGS_COUNT_2 }, { "chk", ARGS_COUNT_2 }, { "clr", ARGS_COUNT_1 }, { "cmp", ARGS_COUNT_2 }, { "cmpa", ARGS_COUNT_2 }, { "cmpi", ARGS_COUNT_2 }, { "cmpm", ARGS_COUNT_2 }, { "dbt", ARGS_COUNT_2 
}, { "dbf", ARGS_COUNT_2 }, { "dbcc", ARGS_COUNT_2 }, { "dbcs", ARGS_COUNT_2 }, { "dbeq", ARGS_COUNT_2 }, { "dbge", ARGS_COUNT_2 }, { "dbgt", ARGS_COUNT_2 }, { "dbhi", ARGS_COUNT_2 }, { "dble", ARGS_COUNT_2 }, { "dbls", ARGS_COUNT_2 }, { "dblt", ARGS_COUNT_2 }, { "dbmi", ARGS_COUNT_2 }, { "dbne", ARGS_COUNT_2 }, { "dbpl", ARGS_COUNT_2 }, { "dbvc", ARGS_COUNT_2 }, { "dbvs", ARGS_COUNT_2 }, { "divu", ARGS_COUNT_2 }, { "divs", ARGS_COUNT_2 }, { "eor", ARGS_COUNT_2 }, { "eori", ARGS_COUNT_2 }, { "exg", ARGS_COUNT_2 }, { "ext", ARGS_COUNT_1 }, { "illegal", ARGS_COUNT_0 }, { "jmp", ARGS_COUNT_1 }, { "jsr", ARGS_COUNT_1 }, { "lea", ARGS_COUNT_2 }, { "link", ARGS_COUNT_2 }, { "lsl", ARGS_COUNT_1_2 }, { "lsr", ARGS_COUNT_1_2 }, { "move", ARGS_COUNT_2 }, { "movea", ARGS_COUNT_2 }, { "movem", ARGS_COUNT_2 }, { "movep", ARGS_COUNT_2 }, { "moveq", ARGS_COUNT_2 }, { "muls", ARGS_COUNT_2 }, { "mulu", ARGS_COUNT_2 }, { "nbcd", ARGS_COUNT_1 }, { "neg", ARGS_COUNT_1 }, { "negx", ARGS_COUNT_1 }, { "nop", ARGS_COUNT_0 }, { "not", ARGS_COUNT_1 }, { "or", ARGS_COUNT_2 }, { "ori", ARGS_COUNT_2 }, { "pea", ARGS_COUNT_1 }, { "reset", ARGS_COUNT_0 }, { "rol", ARGS_COUNT_1_2 }, { "ror", ARGS_COUNT_1_2 }, { "roxl", ARGS_COUNT_1_2 }, { "roxr", ARGS_COUNT_1_2 }, { "rte", ARGS_COUNT_0 }, { "rtr", ARGS_COUNT_0 }, { "rts", ARGS_COUNT_0 }, { "sbcd", ARGS_COUNT_2 }, { "st", ARGS_COUNT_1 }, { "sf", ARGS_COUNT_1 }, { "scc", ARGS_COUNT_1 }, { "scs", ARGS_COUNT_1 }, { "seq", ARGS_COUNT_1 }, { "sge", ARGS_COUNT_1 }, { "sgt", ARGS_COUNT_1 }, { "shi", ARGS_COUNT_1 }, { "sle", ARGS_COUNT_1 }, { "sls", ARGS_COUNT_1 }, { "slt", ARGS_COUNT_1 }, { "smt", ARGS_COUNT_1 }, { "sne", ARGS_COUNT_1 }, { "spl", ARGS_COUNT_1 }, { "svc", ARGS_COUNT_1 }, { "svs", ARGS_COUNT_1 }, { "stop", ARGS_COUNT_1 }, { "sub", ARGS_COUNT_2 }, { "suba", ARGS_COUNT_2 }, { "subi", ARGS_COUNT_2 }, { "subq", ARGS_COUNT_2 }, { "subx", ARGS_COUNT_2 }, { "swap", ARGS_COUNT_1 }, { "tas", ARGS_COUNT_1 }, { "trap", ARGS_COUNT_1 }, { "trapv", ARGS_COUNT_0 }, { "tst", ARGS_COUNT_1 }, { "unlk", ARGS_COUNT_1 }, }; static int pars_directive_skip(struct pars *, enum directive_type, size_t); static int pars_directive_handler_def(struct pars *, enum directive_type, size_t); static int pars_directive_handler_endef(struct pars *, enum directive_type, size_t); static int pars_directive_handler_scl(struct pars *, enum directive_type, size_t); static int pars_directive_handler_type(struct pars *, enum directive_type, size_t); static int pars_directive_handler_val(struct pars *, enum directive_type, size_t); const struct directive_description { const char *str; int (*handler)(struct pars *, enum directive_type, size_t lable_id); } g_directives[DIRECTIVES_COUNT] = { { "", NULL, }, { "align", pars_directive_skip, }, { "ascii", pars_directive_skip, }, { "bin", pars_directive_skip, }, { "bsection", pars_directive_skip, }, { "bss", pars_directive_skip, }, { "byte", pars_directive_skip, }, { "cmnt", pars_directive_skip, }, { "comm", pars_directive_skip, }, { "data", pars_directive_skip, }, { "def", pars_directive_handler_def, }, { "dim", pars_directive_skip, }, { "double", pars_directive_skip, }, { "dsection", pars_directive_skip, }, { "echo", pars_directive_skip, }, { "elifdef", pars_directive_skip, }, { "else", pars_directive_skip, }, { "end", pars_directive_skip, }, { "endc", pars_directive_skip, }, { "endef", pars_directive_handler_endef, }, { "endif", pars_directive_skip, }, { "ends", pars_directive_skip, }, { "extend", pars_directive_skip, }, { "extern", pars_directive_skip, }, { 
"file", pars_directive_skip, }, { "fill", pars_directive_skip, }, { "float", pars_directive_skip, }, { "fpdata", pars_directive_skip, }, { "globl", pars_directive_skip, }, { "ifdef", pars_directive_skip, }, { "ifndef", pars_directive_skip, }, { "include", pars_directive_skip, }, { "lcomm", pars_directive_skip, }, { "line", pars_directive_skip, }, { "ln", pars_directive_skip, }, { "long", pars_directive_skip, }, { "opt", pars_directive_skip, }, { "org", pars_directive_skip, }, { "packed", pars_directive_skip, }, { "page", pars_directive_skip, }, { "reorg", pars_directive_skip, }, { "scl", pars_directive_handler_scl, }, { "section", pars_directive_skip, }, { "short", pars_directive_skip, }, { "single", pars_directive_skip, }, { "size", pars_directive_skip, }, { "space", pars_directive_skip, }, { "struct", pars_directive_skip, }, { "tag", pars_directive_skip, }, { "text", pars_directive_skip, }, { "tsection", pars_directive_skip, }, { "type", pars_directive_handler_type, }, { "val", pars_directive_handler_val, }, { "word", pars_directive_skip, }, { "xdef", pars_directive_skip, }, { "xref", pars_directive_skip, }, }; static bool should_be_escaped(const int c) { return c < ' ' || c == '"' || c == '\\' || c == '<' || c == '>' || c > '~'; } static bool is_oct(const int c) { return c >= '0' && c <= '7'; } static bool is_dec(const int c) { return c >= '0' && c <= '9'; } static bool is_hex(const int c) { return is_dec(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } static bool is_alphabetic(const int c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } static bool is_alphanum(const int c) { return is_dec(c) || is_alphabetic(c); } static int printed_size(const char c) { if (c < ' ' || c > '~') { return sizeof("\\x00")-1; } if (c == '"' || c == '\\') { return sizeof("\\\\")-1; } return 1; } static bool token_is_number(const enum token_type type) { return type == TT_NUMHEX || type == TT_NUMDEC || type == TT_NUMOCT; } static bool token_is_binary_operator(const enum token_type type) { return type == TT_PLUS || type == TT_MINUS || type == TT_ASTERISK || type == TT_SLASH || type == TT_PERCENT || type == TT_RSHIFT || type == TT_RSHIFT || type == TT_AMPERSAND || type == TT_CAP || type == TT_PIPE || type == TT_BANG; } static bool token_is_regmask_delimiter(const enum token_type type) { return type == TT_SLASH || type == TT_MINUS; } static int fprint_string_escaped( const char *const str, const size_t length, FILE *const stream) { int written = 0; for (size_t i = 0; i < length; i++, written += printed_size(str[i])) { if (should_be_escaped(str[i])) { fputs(g_escape_table[(unsigned char)str[i]], stream); } else { fputc(str[i], stream); } } return written; } static int hex_digit_to_int(char c) { if (c >= 'a' && c <= 'f') { return c - 'a'; } if (c >= 'A' && c <= 'F') { return c - 'A'; } return c - '0'; } static const char *token_type_to_string(const enum token_type type) { switch (type) { case TT_NONE: return "NONE"; case TT_NEWLINE: return "NEWLINE"; case TT_ESCAPE: return "ESCAPE"; case TT_DOT: return "DOT"; case TT_COMMA: return "COMMA"; case TT_PLUS: return "PLUS"; case TT_MINUS: return "MINUS"; case TT_ASTERISK: return "ASTERISK"; case TT_SLASH: return "SLASH"; case TT_EQ: return "EQ"; case TT_EQ_DOUBLE: return "EQ_DOUBLE"; case TT_COLON: return "COLON"; case TT_PERCENT: return "PERCENT"; case TT_LSHIFT: return "LSHIFT"; case TT_RSHIFT: return "RSHIFT"; case TT_HASH: return "HASH"; case TT_BANG: return "BANG"; case TT_TILDE: return "TILDE"; case TT_AMPERSAND: return "AMPERSAND"; case 
TT_PIPE: return "PIPE"; case TT_CAP: return "CAP"; case TT_STRING: return "STRING"; case TT_ID: return "ID"; case TT_DOT_ID: return "DOT_ID"; case TT_NUMDEC: return "NUMDEC"; case TT_NUMOCT: return "NUMOCT"; case TT_NUMHEX: return "NUMHEX"; case TT_LPAREN: return "LPAREN"; case TT_RPAREN: return "RPAREN"; case TT_LBRACKET: return "LBRACKET"; case TT_RBRACKET: return "RBRACKET"; case TT_LBRACE: return "LBRACE"; case TT_RBRACE: return "RBRACE"; case TT_COMMENT_ASTERISK: return "COMMENT"; case TT_COMMENT_SEMICOLON: return "COMMENT"; } UNREACHABLE(); return "_UNKNOWN"; } static int fprint_token_debug(const char *const input, struct token *token, FILE *const stream) { int res = fprintf(stream, "%s<", token_type_to_string(token->type)); if (res == -1) { return -1; } int written = res; res = fprint_string_escaped(input + token->offset, token->length, stream); if (res == -1) { return -1; } written += res; res = fputs(">\n", stream); if (res == -1) { return -1; } written += res; return written; } static int fwrite_token(const struct token *const token, FILE *const stream) { const int res = fwrite(token, sizeof *token, 1, stream); assert(res == 1); return res; } static int lex_init(struct lex *const self) { *self = (struct lex){ .input_stream = open_memstream(&self->input, &self->input_size), .tokbuf_stream = open_memstream( (char **)&self->tokbuf, &self->tokbuf_size), }; assert(self->input_stream != NULL); assert(self->tokbuf_stream != NULL); // Place a dummy token at 0 index, so first real token will be at index 1. // This is needed for parser, so it can use zero to indicate absence of // token. fwrite_token(&(struct token){TT_NONE}, self->tokbuf_stream); return OK; } static void lex_yield_token(struct lex *const self, const struct token *const token) { self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE); fwrite_token(token, self->tokbuf_stream); self->tokens_count++; } static const char *lex_state_error_string( const enum lex_state state, const bool inside_line) { if (!inside_line) { assert(state == LS_FREE); return "'*', ';', '0', '[1-9]', '[a-zA-Z_]', ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; } switch (state) { case LS_FREE: return "';', '0', '[1-9]', '[a-zA-Z_]', ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; case LS_NUMOCTHEX: return "';', '[0-7]', [xX], ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; case LS_NUMOCT: return "';', '[0-7]' , ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; case LS_NUMHEX: return "';', '[0-9a-fA-F]' , ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; case LS_NUMDEC: return "';', '[0-9]' , ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; case LS_LSHIFT: return "'<'"; case LS_RSHIFT: return "'>'"; case LS_CR: case LS_EQ: case LS_DOT: case LS_DOT_ID: case LS_ID: case LS_STRING: case LS_STRING_ESC: case LS_COMMENT_ASTERISK: case LS_COMMENT_SEMICOLON: case LS_ERROR: case LS_EOF: UNREACHABLE(); break; } return "???"; } static struct line_pos_info lex_get_line_pos_info( const struct lex *const self, const size_t cursor) { struct line_pos_info l = {0, 0, 0}; bool cr = false; for (size_t i = 0; i < cursor; i++) { const char c = self->input[i]; if (c == '\r') { cr = true; l.line_offset = i + 1; l.line_num++; l.column_num = 0; } else if 
(c == '\n') {
            if (!cr) {
                l.line_num++;
            }
            cr = false;
            l.line_offset = i + 1;
            l.column_num = 0;
        } else {
            cr = false;
            l.column_num++;
        }
    }
    return l;
}

static size_t find_line_length(const char *const str) {
    for (size_t i = 0;; i++) {
        const char c = str[i];
        if (c == '\n' || c == '\r' || c == '\000') {
            return i;
        }
    }
    return 0;
}

static int lex_yield_error(struct lex *const self, const int c) {
    fflush(self->input_stream);
    const size_t cursor = self->cursor;
    const struct line_pos_info l = lex_get_line_pos_info(self, cursor);
    {
        // Read out the rest of the line
        int c;
        do {
            c = getc(stdin);
            const char c_char = (c == EOF) ? 0 : c;
            fwrite(&c_char, sizeof c_char, 1, self->input_stream);
        } while (c != EOF && c != '\n' && c != '\r');
        fflush(self->input_stream);
    }
    const unsigned char c_char = (c == EOF) ? 0 : c;
    fprintf(
            stderr,
            ":%lu:%lu: lexing error: expected %s, found '",
            l.line_num + 1,
            l.column_num + 1,
            lex_state_error_string(self->state, self->inside_line));
    fputs(g_escape_table[c_char], stderr);
    fputs("'\n", stderr);
    const char *const line = self->input + l.line_offset;
    const size_t line_length = find_line_length(line);
    fprintf(stderr, "%5lu | %.*s\n", l.line_num, (int)line_length, line);
    fputs(" | ", stderr);
    for (size_t i = 0; i < l.column_num; i++) {
        if (self->input[l.line_offset + i] == '\t') {
            fputc('\t', stderr);
        } else {
            fputc(' ', stderr);
        }
    }
    fputs("^\n", stderr);
    fprintf(stderr, ": %lu bytes parsed\n", cursor);
    self->state = LS_ERROR;
    return ERR;
}

static int lex_handle_next(struct lex *const self, const int c) {
    switch (self->state) {
    case LS_FREE:
        if (is_alphabetic(c) || c == '_') {
            self->tok_offset = self->cursor;
            self->state = LS_ID;
        } else if (c == '0') {
            self->tok_offset = self->cursor;
            self->current_number_value = 0;
            self->state = LS_NUMOCTHEX;
        } else if (is_dec(c)) {
            self->tok_offset = self->cursor;
            self->current_number_value = c - '0';
            self->state = LS_NUMDEC;
        } else if (c == '@') {
            self->tok_offset = self->cursor;
            self->current_number_value = 0;
            self->state = LS_NUMOCT;
        } else if (c == '$') {
            self->tok_offset = self->cursor;
            self->current_number_value = 0;
            self->state = LS_NUMHEX;
        } else if (c == '"') {
            self->tok_offset = self->cursor;
            self->state = LS_STRING;
        } else if (c == ';') {
            self->tok_offset = self->cursor;
            self->state = LS_COMMENT_SEMICOLON;
        } else if (c == '<') {
            self->tok_offset = self->cursor;
            self->state = LS_LSHIFT;
        } else if (c == '>') {
            self->tok_offset = self->cursor;
            self->state = LS_RSHIFT;
        } else if (c == '.') {
            self->tok_offset = self->cursor;
            self->state = LS_DOT;
        } else if (c == ',') {
            lex_yield_token(self, &(struct token){TT_COMMA, 0, self->cursor, 1});
        } else if (c == '(') {
            lex_yield_token(self, &(struct token){TT_LPAREN, 0, self->cursor, 1});
        } else if (c == ')') {
            lex_yield_token(self, &(struct token){TT_RPAREN, 0, self->cursor, 1});
        } else if (c == '[') {
            lex_yield_token(self, &(struct token){TT_LBRACKET, 0, self->cursor, 1});
        } else if (c == ']') {
            lex_yield_token(self, &(struct token){TT_RBRACKET, 0, self->cursor, 1});
        } else if (c == '{') {
            lex_yield_token(self, &(struct token){TT_LBRACE, 0, self->cursor, 1});
        } else if (c == '}') {
            lex_yield_token(self, &(struct token){TT_RBRACE, 0, self->cursor, 1});
        } else if (c == '+') {
            lex_yield_token(self, &(struct token){TT_PLUS, 0, self->cursor, 1});
        } else if (c == '-') {
            lex_yield_token(self, &(struct token){TT_MINUS, 0, self->cursor, 1});
        } else if (c == '*') {
            if (self->inside_line) {
                lex_yield_token(
                        self, &(struct token){TT_ASTERISK, 0, self->cursor, 1});
            } else {
                self->tok_offset = self->cursor;
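                // A '*' before any other token on the line starts a whole-line
                // comment; inside a line it is the multiply operator (handled above).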
self->state = LS_COMMENT_ASTERISK; } } else if (c == '/') { lex_yield_token(self, &(struct token){TT_SLASH, 0, self->cursor, 1}); } else if (c == '=') { self->tok_offset = self->cursor; self->state = LS_EQ; } else if (c == ':') { lex_yield_token(self, &(struct token){TT_COLON, 0, self->cursor, 1}); } else if (c == '%') { lex_yield_token(self, &(struct token){TT_PERCENT, 0, self->cursor, 1}); } else if (c == '#') { lex_yield_token(self, &(struct token){TT_HASH, 0, self->cursor, 1}); } else if (c == '!') { lex_yield_token(self, &(struct token){TT_BANG, 0, self->cursor, 1}); } else if (c == '~') { lex_yield_token(self, &(struct token){TT_TILDE, 0, self->cursor, 1}); } else if (c == '&') { lex_yield_token(self, &(struct token){TT_AMPERSAND, 0, self->cursor, 1}); } else if (c == '|') { lex_yield_token(self, &(struct token){TT_PIPE, 0, self->cursor, 1}); } else if (c == '^') { lex_yield_token(self, &(struct token){TT_CAP, 0, self->cursor, 1}); } else if (c == '\r') { self->tok_offset = self->cursor; self->state = LS_CR; } else if (c == '\n') { lex_yield_token(self, &(struct token){TT_NEWLINE, 0, self->cursor, 1}); } else if (c == '\\') { lex_yield_token(self, &(struct token){TT_ESCAPE, 0, self->cursor, 1}); } else if (c == ' ' || c == '\t') { // ignore spaces and tabs } else if (c == EOF) { self->state = LS_EOF; } else if (c == '\x1a') { // Ignore "End of file" character } else { return lex_yield_error(self, c); } break; case LS_CR: // Accumulate CRLF into single token { const size_t length = c == '\n' ? 2 : 1; // 2 for CRLF, 1 for just CR const struct token token = {TT_NEWLINE, 0, self->tok_offset, length}; lex_yield_token(self, &token); self->state = LS_FREE; if (c != '\n') { // It is just CR, handle this char in LS_FREE state then return lex_handle_next(self, c); } } break; case LS_LSHIFT: if (c == '<') { const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_LSHIFT, 0, self->tok_offset, length}; lex_yield_token(self, &token); self->state = LS_FREE; } else { return lex_yield_error(self, c); } break; case LS_RSHIFT: if (c == '>') { const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_RSHIFT, 0, self->tok_offset, length}; lex_yield_token(self, &token); self->state = LS_FREE; } else { return lex_yield_error(self, c); } break; case LS_EQ: { const size_t length = (c == '=') ? 2 : 1; const enum token_type type = (c == '=') ? 
TT_EQ_DOUBLE : TT_EQ; const struct token token = {type, 0, self->tok_offset, length}; lex_yield_token(self, &token); } self->state = LS_FREE; if (c != '=') { // It is just single eq "=", handle this char in LS_FREE state then return lex_handle_next(self, c); } break; case LS_DOT: if (is_alphanum(c) || c == '_') { self->state = LS_DOT_ID; } else { lex_yield_token(self, &(struct token){TT_DOT, 0, self->tok_offset, 1}); self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_DOT_ID: if (!is_alphanum(c) && c != '_') { const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_DOT_ID, 0, self->tok_offset, length}; lex_yield_token(self, &token); self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_ID: if (!is_alphanum(c) && c != '_') { const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_ID, 0, self->tok_offset, length}; lex_yield_token(self, &token); self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_NUMOCTHEX: if (c == 'x' || c == 'X') { self->state = LS_NUMHEX; } else if (is_oct(c)) { self->state = LS_NUMOCT; return lex_handle_next(self, c); } else if (is_alphabetic(c) || c == '_') { return lex_yield_error(self, c); } else { assert((self->cursor - self->tok_offset) == 1); const struct token token = {TT_NUMDEC, 0, self->tok_offset, 1}; lex_yield_token(self, &token); // It was just zero, handle this char in LS_FREE state then self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_NUMOCT: if (is_oct(c)) { self->current_number_value <<= 3; self->current_number_value |= c - '0'; } else if (is_alphabetic(c) || c == '_') { return lex_yield_error(self, c); } else { const size_t length = self->cursor - self->tok_offset; const struct token token = { TT_NUMOCT, self->current_number_value, self->tok_offset, length }; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_NUMHEX: if (is_hex(c)) { self->current_number_value <<= 4; self->current_number_value |= hex_digit_to_int(c); } else if (is_alphabetic(c) || c == '_') { // Panik! 
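            // A letter that is not a valid hex digit (or '_') immediately after
            // the number makes the token malformed.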
return lex_yield_error(self, c); } else { const size_t length = self->cursor - self->tok_offset; const struct token token = { TT_NUMHEX, self->current_number_value, self->tok_offset, length }; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_NUMDEC: if (is_dec(c)) { self->current_number_value *= 10; self->current_number_value += c - '0'; } else if (is_alphabetic(c) || c == '_') { return lex_yield_error(self, c); } else { const size_t length = self->cursor - self->tok_offset; const struct token token = { TT_NUMDEC, self->current_number_value, self->tok_offset, length }; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_STRING: if (c == '\\') { self->state = LS_STRING_ESC; } else if (c == '"') { const size_t length = self->cursor - self->tok_offset + 1; const struct token token = {TT_STRING, 0, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished self->state = LS_FREE; } break; case LS_STRING_ESC: self->state = LS_STRING; break; case LS_COMMENT_ASTERISK: if (c == '\r' || c == '\n') { const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_COMMENT_ASTERISK, 0, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_COMMENT_SEMICOLON: if (c == '\r' || c == '\n') { const size_t length = self->cursor - self->tok_offset; const struct token token = {TT_COMMENT_SEMICOLON, 0, self->tok_offset, length}; lex_yield_token(self, &token); // This token is finished, handle this char in LS_FREE state self->state = LS_FREE; return lex_handle_next(self, c); } break; case LS_ERROR: return ERR; case LS_EOF: UNREACHABLE(); } return CONTINUE; } /** Advance lexer to produce new token. * \returns EOF if end of file reached. * \returns ERR if error encountered and lexing cannot continue. * \returns OK has one or more new tokens parsed. */ static int lex_next(struct lex *const self, FILE *const stream) { for (;; self->cursor++) { const int c = fgetc(stream); const char c_char = (c == EOF) ? 0 : c; fwrite(&c_char, sizeof c_char, 1, self->input_stream); const int ret = lex_handle_next(self, c); if (OK == ret) { return OK; } else if (ERR == ret) { return ERR; } if (c == EOF) { // Add a hidden EOF token of 0 size lex_yield_token(self, &(struct token){TT_NONE, 0, self->cursor, 0}); break; } } return EOF; } /** Run lexer until the end of the input reached * \returns OK if lexing finished successfully * \returns ERR if error encountered and lexing cannot continue. 
*/ static int lex_run(struct lex *const self, FILE *const stream) { int res; do { res = lex_next(self, stream); if (res == OK) { res = 0; } else if (res == ERR) { return ERR; } } while (res != EOF); fflush(self->input_stream); fflush(self->tokbuf_stream); return OK; } static void lex_destroy(struct lex *const self) { fclose(self->input_stream); free(self->input); fclose(self->tokbuf_stream); free(self->tokbuf); } static enum args_count get_args_count_for_mnemonic(const enum mnemonic m) { assert(m < MNEMONICS_COUNT); return g_mnemmonics[m].args_count; } static const char *mnemonic_to_string(const enum mnemonic m) { assert(m < MNEMONICS_COUNT); return g_mnemmonics[m].str; } static const char *directive_to_string(const enum directive_type t) { assert(t < DIRECTIVES_COUNT); return g_directives[t].str; } static const char *opsize_to_string(const enum opsize s) { switch (s) { case OPSIZE_NONE: return "none"; case OPSIZE_S: return "short"; case OPSIZE_B: return "byte"; case OPSIZE_W: return "word"; case OPSIZE_L: return "long"; } UNREACHABLE(); return "_unknown"; } static char opsize_to_char(const enum opsize s) { switch (s) { case OPSIZE_NONE: return '_'; case OPSIZE_S: return 's'; case OPSIZE_B: return 'b'; case OPSIZE_W: return 'w'; case OPSIZE_L: return 'l'; } UNREACHABLE(); return '?'; } static enum mnemonic get_mnemonic_from_identifier( const char *const str, const size_t str_length) { if (str_length > 7) { return MN_NONE; } char mnemonic_str[8] = {0}; for (size_t i = 0; i < str_length; i++) { mnemonic_str[i] = tolower(str[i]); } // Start from 1 since - is dummy NONE for (size_t i = 1; i < MNEMONICS_COUNT; i++) { if (0 == strcmp(mnemonic_str, g_mnemmonics[i].str)) { return (enum mnemonic)i; } } return MN_NONE; } static enum directive_type get_directive_from_identifier( const char *const str, const size_t str_length) { // The longest directive have 8 chars (without leading dot), e.g. // "bsection" or "external". 
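    // Longer names cannot match any entry; shorter ones are lowercased and
    // compared against g_directives below.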
if (str_length > 8) { return DT_NONE; } char directive_str[9] = {0}; for (size_t i = 0; i < str_length; i++) { directive_str[i] = tolower(str[i]); } // Start from 1 since - is dummy NONE for (size_t i = 1; i < DIRECTIVES_COUNT; i++) { if (0 == strcmp(directive_str, g_directives[i].str)) { return (enum directive_type)i; } } return DT_NONE; } static const char *arg_type_to_string(const enum arg_type type) { switch (type) { case ARG_NONE: return "NONE"; case ARG_DN: return "Dn"; case ARG_AN: return "An"; case ARG_AN_ADDR: return "(An)"; case ARG_AN_ADDR_INCR: return "(An)+"; case ARG_AN_ADDR_DECR: return "-(An)"; case ARG_AN_ADDR_16: return "(d16,An)"; case ARG_AN_ADDR_8_XI: return "(d8,An,Xi)"; case ARG_ADDR_WORD: return "(xxx).w"; case ARG_ADDR_LONG: return "(xxx).l"; case ARG_ADDR_UNSPEC: return "(xxx).?"; case ARG_PC_ADDR_16: return "(d16,PC)"; case ARG_PC_ADDR_8_XI: return "(d8,PC,Xn)"; case ARG_IMMEDIATE: return "#imm"; case ARG_REGMASK: return "REGMASK"; case ARG_SR: return "SR"; case ARG_CCR: return "CCR"; case ARG_USP: return "USP"; } UNREACHABLE(); return "_UNKNOWN"; } static int pars_init(struct pars *const self, const struct lex *const lex) { *self = (struct pars){ .lex = lex, .stmttab_stream = open_memstream( (char **)&self->stmttab, &self->stmttab_size), .symtab_stream = open_memstream( (char **)&self->symtab, &self->symtab_size), .symbuf_stream = open_memstream(&self->symbuf, &self->symbuf_size), }; assert(self->stmttab_stream != NULL); assert(self->symtab_stream != NULL); assert(self->symbuf_stream != NULL); return OK; } static bool pars_is_eof_reached(const struct pars *const self) { return self->cur_tok_id >= self->lex->tokens_count; } static const char *stmt_type_to_string(const enum stmt_type type) { switch (type) { case ST_NONE: return "NONE"; case ST_LABEL: return "LABEL"; case ST_INSTRUCTION: return "INSTRUCTION"; case ST_ASSIGNMENT: return "ASSIGNMENT"; case ST_COMMENT: return "COMMENT"; case ST_DIRECTIVE: return "DIRECTIVE"; case ST_META_SAT: return "META_SAT"; } return "_UNKNOWN"; } static int fprint_tokens( const struct lex *const lex, const size_t first_token, const size_t num_tokens, FILE *const s) { for (size_t i = 0; i < num_tokens; i++) { const struct token token = lex->tokbuf[first_token + i]; if (token.type == TT_NEWLINE) { break; } if (i > 0) { fputc(' ', s); } fprintf(s, "\"%.*s\"", (int)token.length, lex->input + token.offset); } return 0; } static void fprint_expr( const struct lex *const lex, const struct expr *const expr, FILE *const s) { fputc('[', s); for (size_t i = 0; i < expr->num_tokens; i++) { const struct token token = lex->tokbuf[expr->first_token + i]; if (token.type == TT_NEWLINE) { break; } fprintf(s, "%.*s", (int)token.length, lex->input + token.offset); } fputc(']', s); } static void fprint_arg_debug( const struct lex *const lex, const struct arg *const arg, FILE *const s) { fprintf(s, "(%s", arg_type_to_string(arg->type)); switch (arg->type) { case ARG_DN: fprintf(s, " reg [d%d]", arg->xn); break; case ARG_AN: case ARG_AN_ADDR: case ARG_AN_ADDR_INCR: case ARG_AN_ADDR_DECR: fprintf(s, " reg [a%d]", arg->xn); break; case ARG_AN_ADDR_16: fprintf(s, " reg [a%d]", arg->xn); fprintf(s, " d16 "), fprint_expr(lex, &arg->expr, s); break; case ARG_AN_ADDR_8_XI: fprintf(s, " reg [a%d]", arg->xn); fprintf(s, " d8 "), fprint_expr(lex, &arg->expr, s); fprintf(s, " xi [%c%d]", arg->xi & 0x8 ? 'a' : 'd', arg->xi & 0x7); { const char size = arg->briefext_size == OPSIZE_L ? 
'l' : 'w'; fprintf(s, " briefext_size [%c]", size); } break; case ARG_ADDR_WORD: case ARG_ADDR_LONG: case ARG_ADDR_UNSPEC: fprintf(s, " addr "), fprint_expr(lex, &arg->expr, s); break; case ARG_PC_ADDR_16: fprintf(s, " reg [pc]"); fprintf(s, " d16 "), fprint_expr(lex, &arg->expr, s); break; case ARG_PC_ADDR_8_XI: fprintf(s, " reg [pc]"); fprintf(s, " d8 "), fprint_expr(lex, &arg->expr, s); fprintf(s, " xi [%c%d]", arg->xi & 0x8 ? 'a' : 'd', arg->xi & 0x7); break; case ARG_IMMEDIATE: fprintf(s, " value "), fprint_expr(lex, &arg->expr, s); break; case ARG_REGMASK: fprintf(s, " regs ["); { bool leading_space = false; for (unsigned i = 0; i < 8; i++) { if (arg->regmask & (1 << i)) { fprintf(s, "%sd%d", leading_space ? " " : "", i); leading_space = true; } } for (unsigned i = 0; i < 8; i++) { if (arg->regmask & (1 << (i + 8))) { fprintf(s, "%sa%d", leading_space ? " " : "", i); leading_space = true; } } } fprintf(s, "]"); break; case ARG_SR: fprintf(s, "reg [sr]"); break; case ARG_CCR: fprintf(s, "reg [ccr]"); break; case ARG_USP: fprintf(s, "reg [usp]"); break; case ARG_NONE: break; } fprintf(s, " raw-tokens ["); fprint_tokens(lex, arg->first_token, arg->num_tokens, s); fprintf(s, "])"); } static int fprint_stmt_debug( const struct lex *const lex, struct stmt *const stmt, FILE *const s) { assert(stmt); fprintf(s, "(%s", stmt_type_to_string(stmt->type)); if (stmt->label_token) { const struct token label = lex->tokbuf[stmt->label_token]; fprintf( s, "\n\t(label \"%.*s\")", (int)label.length, lex->input + label.offset); } if (stmt->type == ST_INSTRUCTION) { fprintf( s, "\n\t(mnemonic \"%s\")", mnemonic_to_string(stmt->instruction.mnemonic)); fprintf(s, "\n\t(size %s)", opsize_to_string(stmt->instruction.opsize)); if (stmt->instruction.arg1.type != ARG_NONE) { fprintf(s, "\n\t(arg1 "); fprint_arg_debug(lex, &stmt->instruction.arg1, s); fprintf(s, ")"); } if (stmt->instruction.arg2.type != ARG_NONE) { assert(stmt->instruction.arg1.type != ARG_NONE); fprintf(s, "\n\t(arg2 "); fprint_arg_debug(lex, &stmt->instruction.arg2, s); fprintf(s, ")"); } } else if (stmt->type == ST_DIRECTIVE) { fprintf( s, "\n\t(name \"%s\")", directive_to_string(stmt->directive.type)); if (stmt->directive.first_token && stmt->directive.num_tokens) { fprintf(s, "\n\t(arg (raw-tokens ["); fprint_tokens( lex, stmt->directive.first_token, stmt->directive.num_tokens, s); fprintf(s, "]))"); } } if (stmt->comment_token) { const struct token comment = lex->tokbuf[stmt->comment_token]; fprintf( s, "\n\t(comment \"%.*s\")", (int)comment.length, lex->input + comment.offset); } fprintf(s, "\n\t(raw-tokens ["); fprint_tokens(lex, stmt->first_token, stmt->num_tokens, s); fprintf(s, "]))\n"); return 0; } static int fwrite_stmt(const struct stmt *const stmt, FILE *const stream) { const int res = fwrite(stmt, sizeof *stmt, 1, stream); assert(res == 1); return res; } static size_t get_arg_size( const struct arg *const arg, const enum opsize opsize) { switch (arg->type) { case ARG_NONE: return 0; case ARG_DN: return 0; case ARG_AN: return 0; case ARG_AN_ADDR: return 0; case ARG_AN_ADDR_INCR: return 0; case ARG_AN_ADDR_DECR: return 0; case ARG_AN_ADDR_16: return 2; case ARG_AN_ADDR_8_XI: return arg->briefext_size == OPSIZE_W ? 2 : 4; // FIXME I'm not sure how it works case ARG_ADDR_WORD: return 2; case ARG_ADDR_LONG: return 4; case ARG_ADDR_UNSPEC: return 2; // FIXME I'm not sure how it works case ARG_PC_ADDR_16: return 2; case ARG_PC_ADDR_8_XI: return arg->briefext_size == OPSIZE_W ? 
2 : 4; // FIXME I'm not sure how it works case ARG_IMMEDIATE: switch (opsize) { case OPSIZE_S: case OPSIZE_B: return 1; case OPSIZE_W: return 2; case OPSIZE_NONE: // FIXME it should depend on the value, I guess case OPSIZE_L: return 4; } break; case ARG_REGMASK: return 2; case ARG_SR: return 0; case ARG_CCR: return 0; case ARG_USP: return 0; } UNREACHABLE(); return 0; } static size_t get_instruction_size(const struct instruction *const instr) { return 2 + get_arg_size(&instr->arg1, instr->opsize) + get_arg_size(&instr->arg2, instr->opsize); } static void pars_put_stmt( struct pars *const self, struct stmt *const stmt) { if (stmt->label_token) { // fflush is necessary to update stmttab_size variable fflush(self->stmttab_stream); const struct sym sym = { .stmt_id = self->stmttab_size / (sizeof *self->stmttab), .addr = self->addr_cursor, }; const int res = fwrite(&sym, sizeof sym, 1, self->symtab_stream); assert(res == 1); (void) res; } if (stmt->type == ST_INSTRUCTION) { stmt->addr = self->addr_cursor; self->addr_cursor += get_instruction_size(&stmt->instruction); } fwrite_stmt(stmt, self->stmttab_stream); } static struct token pars_peek(const struct pars *const self) { return self->lex->tokbuf[self->cur_tok_id]; } static struct token pars_peek_more( const struct pars *const self, const size_t more) { return self->lex->tokbuf[self->cur_tok_id + more]; } static size_t pars_commit(struct pars *const self) { return self->cur_tok_id++; } static void pars_skip_to_newline(struct pars *const self) { // Reset state In case of inside of the .def .. .endef block self->in_sat = false; while (!pars_is_eof_reached(self)) { const struct token nl = pars_peek(self); pars_commit(self); if (nl.type == TT_NEWLINE) { return; } } } static int pars_yield_error_msg( struct pars *const self, const size_t token_id, const char *const msg) { const struct token token = self->lex->tokbuf[token_id]; const struct line_pos_info l = lex_get_line_pos_info(self->lex, token.offset); fprintf( stderr, ":%lu:%lu: parsing error: %s\n", l.line_num + 1, l.column_num + 1, msg); const char *const line = self->lex->input + l.line_offset; const size_t line_length = find_line_length(line); fprintf(stderr, "%5lu | %.*s\n", l.line_num, (int)line_length, line); fputs(" | ", stderr); for (size_t i = 0; i < l.column_num; i++) { if (self->lex->input[l.line_offset + i] == '\t') { fputc('\t', stderr); } else { fputc(' ', stderr); } } fputs("^\n", stderr); pars_skip_to_newline(self); return ERR; } static int pars_yield_error_expected_str( struct pars *const self, const struct line_pos_info l, const char *const found, const size_t found_length, const char *const expected) { fprintf( stderr, ":%lu:%lu: parsing error: expected %s, found '", l.line_num + 1, l.column_num + 1, expected); fprint_string_escaped(found, found_length, stderr); fputs("'\n", stderr); const char *const line = self->lex->input + l.line_offset; const size_t line_length = find_line_length(line); fprintf( stderr, "%5lu | %.*s\n", l.line_num + 1, (int)line_length, line); fputs(" | ", stderr); for (size_t i = 0; i < l.column_num; i++) { if (self->lex->input[l.line_offset + i] == '\t') { fputc('\t', stderr); } else { fputc(' ', stderr); } } fputc('^', stderr); for (size_t i = 1; i < found_length; i++) { fputc('~', stderr); } fputc('\n', stderr); pars_skip_to_newline(self); return ERR; } static int pars_yield_error( struct pars *const self, const size_t token_id, const char *const expected) { const struct token token = self->lex->tokbuf[token_id]; const struct line_pos_info l = 
lex_get_line_pos_info(self->lex, token.offset);
    const char *const found = self->lex->input + token.offset;
    return pars_yield_error_expected_str(self, l, found, token.length, expected);
}

static int pars_yield_error_eof(
        struct pars *const self, const char *const expected) {
    const struct token token = self->lex->tokbuf[self->cur_tok_id];
    const struct line_pos_info l =
        lex_get_line_pos_info(self->lex, token.offset);
    return pars_yield_error_expected_str(
            self, l, "EOF", (sizeof "EOF") - 1, expected);
}

enum opsize get_opsize_from_specifier(const char size_specifier) {
    switch (tolower(size_specifier)) {
    case 's': return OPSIZE_S;
    case 'b': return OPSIZE_B;
    case 'w': return OPSIZE_W;
    case 'l': return OPSIZE_L;
    }
    return OPSIZE_NONE;
}

static bool is_pc(const char *const str) {
    return (str[0] == 'p' && str[1] == 'c') || (str[0] == 'P' && str[1] == 'C');
}

static bool is_sp(const char *const str) {
    return (str[0] == 's' && str[1] == 'p') || (str[0] == 'S' && str[1] == 'P');
}

static bool is_sr(const char *const str) {
    return (str[0] == 's' && str[1] == 'r') || (str[0] == 'S' && str[1] == 'R');
}

static bool is_ccr(const char *const str) {
    return ((str[0] == 'c' && str[1] == 'c' && str[2] == 'r') ||
            (str[0] == 'C' && str[1] == 'C' && str[2] == 'R'));
}

static bool is_usp(const char *const str) {
    return ((str[0] == 'u' && str[1] == 's' && str[2] == 'p') ||
            (str[0] == 'U' && str[1] == 'S' && str[2] == 'P'));
}

static struct token_recognition pars_recognize_token(
        const struct pars *const self, const struct token token) {
    const char *const str = self->lex->input + token.offset;
    if (token.type == TT_ID) {
        if (token.length == 2) {
            if (tolower(str[0]) == 'a' && is_oct(str[1])) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_AN,
                    .reg_num = str[1] - '0',
                };
            } else if (tolower(str[0]) == 'd' && is_oct(str[1])) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_DN,
                    .reg_num = str[1] - '0',
                };
            } else if (is_sp(str)) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_AN,
                    .reg_num = 7,
                };
            } else if (is_pc(str)) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_PC,
                };
            } else if (is_sr(str)) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_SR,
                };
            }
        } else if (token.length == 3) {
            if (is_ccr(str)) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_CCR,
                };
            } else if (is_usp(str)) {
                return (struct token_recognition){
                    .type = RTT_REG,
                    .reg = REG_USP,
                };
            }
        }
    } else if (token.type == TT_NUMDEC) {
        // TODO
    } else if (token.type == TT_NUMOCT) {
        // TODO
    } else if (token.type == TT_NUMHEX) {
        // TODO
    }
    return (struct token_recognition){0};
}

struct expr_value {
    enum token_type operator;
    int32_t value;
    bool negative;
    bool bit_inverted;
};

static int32_t apply_binary_operator(int32_t a, int32_t b, enum token_type op) {
    switch (op) {
    case TT_ASTERISK: return a * b;
    case TT_SLASH: return a / b;
    case TT_LSHIFT: return a << b;
    case TT_RSHIFT: return a >> b;
    case TT_AMPERSAND: return a & b;
    case TT_PIPE: return a | b;
    case TT_CAP: return a ^ b;
    case TT_NONE:
    case TT_PLUS: return a + b;
    default: break;
    }
    UNREACHABLE();
    assert(false);
}

static int pars_parse_expr(
        struct pars *const self, struct expr *const expr, int flags) {
    // This function is called only when an expression is unconditionally
    // expected, so if the first token cannot be part of an expression, an
    // error must be yielded.
    const size_t first_token_id = self->cur_tok_id;
    const char *const e_expr_open = (flags & PARS_EXPR_FLAG_ALLOW_ID) ?
E_EXPR_OPEN_WITH_ID : E_EXPR_OPEN; const char *const e_expr_close = (flags & PARS_EXPR_FLAG_ALLOW_ID) ? E_EXPR_CLOSE_WITH_ID : E_EXPR_CLOSE; struct expr_value stack[EXPR_NESTING_MAX] = {{0}}; bool value_is_resolved = true; unsigned nesting = 0; // Otherwise expect open parenthesis, number, or unary operator. bool expect_close_or_binary = false; while (1) { if (pars_is_eof_reached(self)) { if (nesting != 0) { assert(pars_is_eof_reached(self)); return pars_yield_error_eof( self, expect_close_or_binary ? e_expr_close : e_expr_open); } break; } const struct token token = pars_peek(self); if (token.type == TT_LPAREN) { if (expect_close_or_binary) { if (nesting == 0) { break; } return pars_yield_error(self, self->cur_tok_id, e_expr_close); } else { nesting++; if (nesting >= EXPR_NESTING_MAX) { return pars_yield_error_msg(self, self->cur_tok_id, E_MAX_NESTING); } } } else if (token.type == TT_MINUS) { // Minus is both unary and binary operator, so it does not care // about expression parsing state expect_close_or_binary = false; stack[nesting].negative = !stack[nesting].negative; } else if (token.type == TT_TILDE) { if (expect_close_or_binary) { return pars_yield_error(self, self->cur_tok_id, e_expr_close); } stack[nesting].bit_inverted = !stack[nesting].bit_inverted; } else if (token.type == TT_ID) { if (0 == (flags & PARS_EXPR_FLAG_ALLOW_ID)) { return pars_yield_error( self, self->cur_tok_id, expect_close_or_binary ? e_expr_close : e_expr_open); } value_is_resolved = false; if (expect_close_or_binary) { return pars_yield_error(self, self->cur_tok_id, e_expr_close); } if (pars_recognize_token(self, token).type == RTT_REG) { return pars_yield_error(self, self->cur_tok_id, E_EXPR_NONREG); } expect_close_or_binary = true; stack[nesting].operator = TT_NONE; stack[nesting].negative = false; } else if (token_is_number(token.type)) { if (expect_close_or_binary) { return pars_yield_error(self, self->cur_tok_id, e_expr_close); } expect_close_or_binary = true; int32_t value = token.value; if (stack[nesting].negative) { value = -value; } if (stack[nesting].bit_inverted) { value = ~value; } stack[nesting].value = apply_binary_operator( stack[nesting].value, value, stack[nesting].operator); stack[nesting].operator = TT_NONE; } else if (token_is_binary_operator(token.type)) { if (!expect_close_or_binary) { return pars_yield_error(self, self->cur_tok_id, e_expr_open); } expect_close_or_binary = false; stack[nesting].operator = token.type; } else if (token.type == TT_RPAREN) { if (!expect_close_or_binary) { return pars_yield_error(self, self->cur_tok_id, e_expr_open); } if (nesting == 0) { // This is not my closing parenthesis, should stop break; } nesting--; int32_t value = stack[nesting + 1].value; if (stack[nesting].negative) { value = -value; } if (stack[nesting].bit_inverted) { value = ~value; } stack[nesting].value = apply_binary_operator( stack[nesting].value, value, stack[nesting].operator); stack[nesting].operator = TT_NONE; } else { if (nesting == 0 && expect_close_or_binary) { break; } return pars_yield_error( self, self->cur_tok_id, expect_close_or_binary ? 
e_expr_close : e_expr_open);
        }
        pars_commit(self);
    }
    assert(first_token_id != self->cur_tok_id);
    *expr = (struct expr){
        .first_token = first_token_id,
        .num_tokens = self->cur_tok_id - first_token_id,
        .value = stack[nesting].value,
        .value_is_resolved = value_is_resolved,
    };
    return OK;
}

static int pars_parse_comment_and_newline2(
        struct pars *const self,
        size_t *const output_comment_id,
        const bool allow_escape) {
    size_t comment_id = 0;
    if (!pars_is_eof_reached(self)) {
        // Try to parse a comment
        const struct token token1 = pars_peek(self);
        const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
            token1.type == TT_COMMENT_SEMICOLON;
        if (is_comment) {
            comment_id = pars_commit(self);
        }
    }
    if (!pars_is_eof_reached(self)) {
        // There must be a new line if not EOF, or an escape where allowed
        const size_t nl_id = pars_commit(self);
        const struct token nl = self->lex->tokbuf[nl_id];
        if (nl.type != TT_NEWLINE && !(allow_escape && nl.type == TT_ESCAPE)) {
            return pars_yield_error(
                    self, nl_id, comment_id ? E_NL : E_COMMENT_NL);
        }
    }
    *output_comment_id = comment_id;
    return OK;
}

static int pars_parse_comment_and_newline(
        struct pars *const self, size_t *const output_comment_id) {
    return pars_parse_comment_and_newline2(self, output_comment_id, false);
}

static int pars_finish_directive(
        struct pars *const self,
        const size_t label_id,
        const struct directive directive) {
    // Finish parsing the directive: expect a comment or a newline, or even an
    // escape symbol in some cases.
    const bool allow_escape = directive.type == DT_DEF ||
        directive.type == DT_DIM ||
        directive.type == DT_LINE ||
        directive.type == DT_SCL ||
        directive.type == DT_SIZE ||
        directive.type == DT_TAG ||
        directive.type == DT_TYPE ||
        directive.type == DT_VAL;
    size_t comment_id = 0;
    const int ret = pars_parse_comment_and_newline2(
            self, &comment_id, allow_escape);
    if (ret != OK) {
        return ret;
    }
    const size_t first_token = label_id ?
label_id : directive.name_token; struct stmt stmt = { .type = ST_DIRECTIVE, .directive = directive, .label_token = label_id, .comment_token = comment_id, .first_token = first_token, .num_tokens = self->cur_tok_id - first_token, }; pars_put_stmt(self, &stmt); if (directive.type == DT_ENDEF) { struct stmt stmt = { .type = ST_META_SAT, .sat = self->sat, }; pars_put_stmt(self, &stmt); } return OK; } static int pars_directive_skip( struct pars *const self, const enum directive_type drc, const size_t label_id) { const size_t name_token = self->cur_tok_id - 1; const size_t first_token = self->cur_tok_id; size_t num_tokens = 0; while (1) { const struct token token = pars_peek(self); const bool is_end = token.type == TT_COMMENT_SEMICOLON || token.type == TT_NEWLINE || token.type == TT_ESCAPE; if (is_end) { break; } pars_commit(self); num_tokens++; } const struct directive directive = { drc, name_token, first_token, num_tokens }; return pars_finish_directive(self, label_id, directive); } static int pars_directive_handler_def( struct pars *const self, const enum directive_type drc, const size_t label_id) { const size_t name_token = self->cur_tok_id - 1; const struct token arg_token = pars_peek(self); if (arg_token.type != TT_ID) { return pars_yield_error(self, self->cur_tok_id, E_ID); } if (self->in_sat) { return pars_yield_error_msg(self, self->cur_tok_id, E_NESTED_DEF); } self->sat = (struct sat){0}; self->in_sat = true; self->sat.def_arg = (struct expr){self->cur_tok_id, 1, 0, false}; const struct directive directive = { drc, name_token, pars_commit(self), 1 }; return pars_finish_directive(self, label_id, directive); } static int pars_directive_handler_endef( struct pars *const self, const enum directive_type drc, const size_t label_id) { if (!self->in_sat) { return pars_yield_error_msg(self, self->cur_tok_id, E_NMATCH_ENDEF); } self->in_sat = false; const size_t name_token = self->cur_tok_id - 1; const struct directive directive = { drc, name_token, 0, 0 }; return pars_finish_directive(self, label_id, directive); } static int pars_directive_handler_scl( struct pars *const self, const enum directive_type drc, const size_t label_id) { const size_t name_token = self->cur_tok_id - 1; if (!self->in_sat) { return pars_yield_error_msg(self, self->cur_tok_id, E_NMATCH_ENDEF); } if (self->sat.scl_arg.first_token) { return pars_yield_error_msg(self, self->cur_tok_id, E_MULTIPLE_SCL); } struct expr expr = {0}; const int ret = pars_parse_expr(self, &expr, 0); if (ret != OK) { return ret; } assert(expr.value_is_resolved); const struct directive directive = { drc, name_token, expr.first_token, expr.num_tokens }; self->sat.scl_arg = expr; return pars_finish_directive(self, label_id, directive); } static int pars_directive_handler_type( struct pars *const self, const enum directive_type drc, const size_t label_id) { const size_t name_token = self->cur_tok_id - 1; if (!self->in_sat) { return pars_yield_error_msg(self, self->cur_tok_id, E_NMATCH_ENDEF); } if (self->sat.type_arg.first_token) { return pars_yield_error_msg(self, self->cur_tok_id, E_MULTIPLE_TYPE); } struct expr expr = {0}; const int ret = pars_parse_expr(self, &expr, 0); if (ret != OK) { return ret; } assert(expr.value_is_resolved); const struct directive directive = { drc, name_token, expr.first_token, expr.num_tokens }; self->sat.type_arg = expr; return pars_finish_directive(self, label_id, directive); } static int pars_directive_handler_val( struct pars *const self, const enum directive_type drc, const size_t label_id) { const size_t name_token = 
self->cur_tok_id - 1; const struct token arg_token = pars_peek(self); if (!self->in_sat) { return pars_yield_error_msg(self, self->cur_tok_id, E_NMATCH_ENDEF); } if (self->sat.val_arg.first_token) { return pars_yield_error_msg(self, self->cur_tok_id, E_MULTIPLE_VAL); } struct expr expr = {self->cur_tok_id, 1, 0, false}; if (arg_token.type == TT_ID || arg_token.type == TT_DOT) { pars_commit(self); } else { const int ret = pars_parse_expr(self, &expr, 0); if (ret != OK) { return ret; } assert(expr.value_is_resolved); } const struct directive directive = { drc, name_token, expr.first_token, expr.num_tokens }; self->sat.val_arg = expr; return pars_finish_directive(self, label_id, directive); } static int pars_parse_direc(struct pars *const self, const size_t label_id) { const struct token dotid = pars_peek(self); // Get rid of leading dot in the string pointer and in the length as well by // adding and subtracting 1 respectively enum directive_type d = get_directive_from_identifier( self->lex->input + dotid.offset + 1, dotid.length - 1); if (d == DT_NONE) { return pars_yield_error_msg(self, self->cur_tok_id, E_UNKNOWN_DRC); } pars_commit(self); return g_directives[d].handler(self, d, label_id); } static int pars_parse_arg_after_prefix_expr( struct pars *const self, struct arg *const arg) { // At this point a single expression has been parsed and committed. // It can be one of: // - Standalone expression // - Standalone expression with size suffix like ".l" // - Prefix expression followed by (An), (PC), (An,Xn) or (PC,Xn) if (pars_is_eof_reached(self)) { // It was a standalone expression without size suffix, yield an // argument from here arg->type = ARG_ADDR_UNSPEC; return OK; } struct token token0 = pars_peek(self); if (token0.type == TT_DOT_ID) { // It must be a size specifier, or error otherwise const size_t size_spec_id = pars_commit(self); if (token0.length != 2) { return pars_yield_error(self, size_spec_id, E_ADDR_SIZE_SPEC); } const enum opsize addrsize = get_opsize_from_specifier(self->lex->input[token0.offset + 1]); if (addrsize == OPSIZE_NONE || addrsize == OPSIZE_S) { return pars_yield_error(self, size_spec_id, E_ADDR_SIZE_SPEC); } arg->type = addrsize == OPSIZE_L ? ARG_ADDR_LONG : ARG_ADDR_WORD; } else if (token0.type != TT_LPAREN) { // It was a standalone expression without size suffix, yield an // argument from here arg->type = ARG_ADDR_UNSPEC; return OK; } // Suffix must be read anyway if (pars_peek(self).type == TT_LPAREN) { // It is a prefix expression for (An), (PC), (An,Xn) or (PC,Xn) pars_commit(self); return pars_parse_arg_inside_parens(self, arg); } // It is a standalone expression with or without size suffix, yield an // argument from here return OK; } static int pars_parse_arg_starts_with_minus( struct pars *const self, struct arg *const arg) { // At this point cur_tok_id points to the minus that has been peeked, but // not committed. 
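/* Illustrative examples of the disambiguation performed here (the operand spellings are assumptions, not taken from a real listing): "-(A7)" is predecrement addressing and yields ARG_AN_ADDR_DECR below; "-4(A6)" is a prefix expression "-4" followed by "(A6)" and is finished by pars_parse_arg_after_prefix_expr(); "-LABEL+2" is a plain expression argument that happens to start with unary minus. */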
const size_t first_token_id = self->cur_tok_id; if (pars_is_eof_reached(self)) { pars_commit(self); // The minus token // Just single minus is invalid expression return pars_yield_error_eof(self, "'(' or expression"); } if (pars_peek_more(self, 1).type == TT_LPAREN) { // It is still either expression or -(An) if (pars_is_eof_reached(self)) { // "-(" is invalid expression pars_commit(self), pars_commit(self); // Commit "-" and "(" return pars_yield_error_eof(self, "An or expression"); } const struct token token2 = pars_peek_more(self, 2); if (token2.type == TT_ID) { struct token_recognition r = pars_recognize_token(self, token2); if (r.type == RTT_REG && r.reg == REG_AN) { // It is definitely -(An). Commit all previous tokens and // expect closing parenthesis. self->cur_tok_id += 3; const size_t rparen_id = pars_commit(self); const struct token rparen = self->lex->tokbuf[rparen_id]; if (rparen.type == TT_RPAREN) { // Perfect! *arg = (struct arg){ .type = ARG_AN_ADDR_DECR, .xn = r.reg_num, .first_token = first_token_id, .num_tokens = self->cur_tok_id - first_token_id, }; return OK; } else { // But it has to be a closing parenthesis! return pars_yield_error(self, rparen_id, "')'"); } } } } // Otherwise it is expression - either prefix or standalone const int ret = pars_parse_expr(self, &arg->expr, PARS_EXPR_FLAG_ALLOW_ID); if (ret != OK) { return ret; } return pars_parse_arg_after_prefix_expr(self, arg); } struct inside_parens_state { bool an1_found, an2_found, dn_found, pc_found; enum opsize size; uint8_t an1, an2, dn; }; static int pars_parse_arg_inside_parens_single_item( struct pars *const self, struct arg *const arg, struct inside_parens_state *const state) { const struct token token0 = pars_peek(self); if (token0.type == TT_ID) { // It it may be An/Dn/PC register struct token_recognition r = pars_recognize_token(self, token0); if (r.type == RTT_REG) { // This is definitely a register or regmask. 
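/* Register bookkeeping sketch (register spellings below are illustrative): an address register fills state->an1, then state->an2; a data register fills state->dn, optionally with a .w/.l index size; PC only sets pc_found. For a hypothetical operand "(8,A6,D3.w)" the three items end up as expr "8", an1 = 6 and dn = 3 with size = OPSIZE_W, which the caller pars_parse_arg_inside_parens() later maps to ARG_AN_ADDR_8_XI. */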
bool it_is_pc = false; const size_t token0_id = pars_commit(self); switch (r.reg) { case REG_DN: state->dn_found = true; state->dn = r.reg_num; /* An optional .w/.l index size suffix is handled below, after the switch, same as for An */ break; case REG_AN: if (!state->an1_found) { state->an1_found = true; state->an1 = r.reg_num; } else if (!state->an2_found) { state->an2_found = true; state->an2 = r.reg_num; } else { return pars_yield_error( self, token0_id, E_EA_PART_NOT_AN); } break; case REG_PC: state->pc_found = true; it_is_pc = true; break; case REG_NONE: UNREACHABLE(); case REG_SR: case REG_CCR: case REG_USP: return pars_yield_error(self, token0_id, E_EA_PART); } if (!it_is_pc) { const struct token token_size_spec = pars_peek(self); if (token_size_spec.type == TT_DOT_ID) { // It must be a size specifier, or error otherwise const size_t size_spec_id = pars_commit(self); if (token_size_spec.length != 2) { return pars_yield_error( self, size_spec_id, E_ADDR_INDIR_SIZE_SPEC); } const char c = self->lex->input[token_size_spec.offset + 1]; const enum opsize addrsize = get_opsize_from_specifier(c); if (addrsize != OPSIZE_W && addrsize != OPSIZE_L) { return pars_yield_error( self, size_spec_id, E_ADDR_INDIR_SIZE_SPEC); } if (state->size != OPSIZE_NONE) { return pars_yield_error_msg( self, size_spec_id, E_ADDR_INDIR_MULTIPLE_INDEX_REGS); } state->size = addrsize; } } } } else if (arg->expr.first_token == 0) { const int ret = pars_parse_expr(self, &arg->expr, PARS_EXPR_FLAG_ALLOW_ID); if (ret != OK) { return ret; } const struct token token_size_spec = pars_peek(self); if (token_size_spec.type == TT_DOT_ID) { // It must be a size specifier, or error otherwise const size_t size_spec_id = pars_commit(self); if (token_size_spec.length != 2) { return pars_yield_error(self, size_spec_id, E_ADDR_SIZE_SPEC); } const char c = self->lex->input[token_size_spec.offset + 1]; const enum opsize addrsize = get_opsize_from_specifier(c); if (addrsize == OPSIZE_NONE || addrsize == OPSIZE_S) { return pars_yield_error(self, size_spec_id, E_ADDR_SIZE_SPEC); } // Just skip it, because it does not matter } } else { return pars_yield_error(self, self->cur_tok_id, E_EA_PART_NOT_EXPR); } return OK; } static int pars_parse_arg_inside_parens( struct pars *const self, struct arg *const arg) { // At this point cur_tok_id points after the first opening parenthesis that // has been parsed (committed). // It can be // - (expr)(An) // - (expr)(An,Xi) or (expr)(Xi,An) // - (expr)(An,Xi.w) or (expr)(Xi.w,An) // - (expr)(PC,Xi) or (expr)(Xi,PC) // - (expr)(PC,Xi.w) or (expr)(Xi.w,PC) // - (An) or (An)+ // - (An,expr) or (expr,An) // - (PC,expr) or (expr,PC) // - (An,expr,Xi) in any order (6 variants) // - (An,expr,Xi.w) in any order (6 variants) // - (PC,expr,Xi) in any order (6 variants) // - (PC,expr,Xi.w) in any order (6 variants) unsigned parts = arg->expr.first_token ?
1 : 0; struct inside_parens_state state = {0}; while (parts < 3) { if (pars_is_eof_reached(self)) { return pars_yield_error_eof(self, E_EA_PART); } const int ret = pars_parse_arg_inside_parens_single_item(self, arg, &state); if (ret != OK) { return ret; } parts++; if (pars_is_eof_reached(self)) { return pars_yield_error_eof(self, E_EA_PART_DELIM); } const struct token delim = pars_peek(self); const size_t delim_id = pars_commit(self); if (delim.type == TT_COMMA) { continue; } else if (delim.type == TT_RPAREN) { if (parts == 1 && arg->expr.first_token) { assert(!state.an1_found && !state.an2_found && !state.dn_found && !state.pc_found); // It turns out we are inside of expression, so this closing // parenthesis is part of it. Let's accumulate it and move // on. arg->expr.first_token--; arg->expr.num_tokens += 2; return pars_parse_arg_after_prefix_expr(self, arg); } else { break; } } else { return pars_yield_error(self, delim_id, E_EA_PART); } } if (parts == 1 && state.an1_found) { // It is either (An) or (An)+ assert(!state.pc_found && !state.dn_found && !arg->expr.first_token); if (pars_is_eof_reached(self)) { arg->type = ARG_AN_ADDR; } else { const struct token plus = pars_peek(self); if (plus.type == TT_PLUS) { pars_commit(self); arg->type = ARG_AN_ADDR_INCR; } else { arg->type = ARG_AN_ADDR; } } arg->xn = state.an1; arg->num_tokens = self->cur_tok_id - arg->first_token; return OK; } else if (parts == 2 && state.an1_found && arg->expr.first_token) { // It is (An,d16) or (d16,An) assert(!state.an2_found && !state.pc_found && !state.dn_found); arg->type = ARG_AN_ADDR_16; arg->xn = state.an1; arg->num_tokens = self->cur_tok_id - arg->first_token; return OK; } else if (parts == 2 && state.pc_found && arg->expr.first_token) { // It is (PC,d16) or (d16,PC) assert(!state.an1_found && !state.an2_found && !state.dn_found); arg->type = ARG_PC_ADDR_16; arg->num_tokens = self->cur_tok_id - arg->first_token; return OK; } else if (parts == 3 && state.pc_found && arg->expr.first_token && (state.an1_found || state.dn_found)) { // It is (d8,PC,Xn) assert((state.an1_found && !state.dn_found) || (!state.an1_found && state.dn_found)); arg->type = ARG_PC_ADDR_8_XI; arg->xi = state.an1_found ? (state.an1 | 0x8) : state.dn; arg->num_tokens = self->cur_tok_id - arg->first_token; arg->briefext_size = state.size; return OK; } else if (parts == 3 && state.an1_found && arg->expr.first_token && (state.an2_found || state.dn_found)) { // It is (d8,An,Xn) assert((state.an2_found && !state.dn_found) || (!state.an2_found && state.dn_found)); arg->type = ARG_AN_ADDR_8_XI; // FIXME an1 can be just an index register if it has size specifier, in // that case an2 should be used here arg->xn = state.an1; arg->xi = state.an2_found ? (state.an2 | 0x8) : state.dn; arg->num_tokens = self->cur_tok_id - arg->first_token; arg->briefext_size = state.size; return OK; } return pars_yield_error_msg(self, self->cur_tok_id, E_EA_INVALID); } static int pars_parse_arg_regmask( struct pars *const self, struct arg *const arg) { // At this point cur_tok_id points to the register token that has been // peeked, but not committed. bool range = false, delimiter = true, range_an = false; uint16_t regmask = 0; bool reg_found = false; uint8_t reg = 0; while (1) { if (pars_is_eof_reached(self)) { if (range) { return pars_yield_error_eof(self, range_an ? 
E_AN : E_DN); } return OK; } const struct token token = pars_peek(self); if (token.type == TT_ID) { struct token_recognition r = pars_recognize_token(self, token); if (r.type == RTT_REG) { if (r.reg == REG_AN) { if (range) { assert(reg_found); if (!range_an) { return pars_yield_error( self, self->cur_tok_id, E_AN); } if (r.reg_num < reg) { return pars_yield_error_msg( self, self->cur_tok_id, E_REGMASK_ASCEND); } range = false; for (int i = reg; i <= r.reg_num; i++) { regmask |= 1 << (i + 8); } reg_found = false; } else if (delimiter) { delimiter = false; reg_found = true; reg = r.reg_num; range_an = true; } else { return pars_yield_error( self, self->cur_tok_id, E_REGMASK_DELIM); } } else if (r.reg == REG_DN) { if (range) { assert(reg_found); if (range_an) { return pars_yield_error( self, self->cur_tok_id, E_DN); } if (r.reg_num < reg) { return pars_yield_error_msg( self, self->cur_tok_id, E_REGMASK_ASCEND); } range = false; for (int i = reg; i <= r.reg_num; i++) { regmask |= 1 << i; } reg_found = false; } else if (delimiter) { delimiter = false; reg_found = true; reg = r.reg_num; range_an = false; } else { return pars_yield_error( self, self->cur_tok_id, E_REGMASK_DELIM); } } else { return pars_yield_error( self, self->cur_tok_id, (range || delimiter) ? E_AN_DN : E_REGMASK_DELIM); } } else { return pars_yield_error( self, self->cur_tok_id, (range || delimiter) ? E_AN_DN : E_REGMASK_DELIM); } } else if (token.type == TT_SLASH) { if (range || delimiter) { return pars_yield_error(self, self->cur_tok_id, E_AN_DN); } if (reg_found) { reg_found = false; regmask |= 1 << (reg + (range_an ? 8 : 0)); } delimiter = true; } else if (token.type == TT_MINUS) { if (range || delimiter) { return pars_yield_error(self, self->cur_tok_id, E_AN_DN); } range = true; } else if (regmask) { // Do not commit here because it is not ours token if (reg_found) { reg_found = false; regmask |= 1 << (reg + (range_an ? 8 : 0)); } arg->type = ARG_REGMASK; arg->regmask = regmask; return OK; } else { return pars_yield_error( self, self->cur_tok_id, (range || delimiter) ? 
E_AN_DN : E_REGMASK_DELIM); } pars_commit(self); } UNREACHABLE(); return pars_yield_error_msg(self, ++self->cur_tok_id, E_UNREACH); } static int pars_parse_arg( struct pars *const self, struct arg *const arg) { if (pars_is_eof_reached(self)) { return OK; } const size_t first_token_id = self->cur_tok_id; arg->first_token = first_token_id; const struct token token0 = pars_peek(self); if (token0.type == TT_HASH) { // Definitely an immediate value expression pars_commit(self); const int ret = pars_parse_expr(self, &arg->expr, PARS_EXPR_FLAG_ALLOW_ID); if (ret != OK) { return ret; } arg->type = ARG_IMMEDIATE; arg->num_tokens = self->cur_tok_id - first_token_id; return OK; } else if (token0.type == TT_MINUS) { // It is either expression or -(An) return pars_parse_arg_starts_with_minus(self, arg); } else if (token0.type == TT_TILDE || token_is_number(token0.type)) { // Tilde is a unary operator, so it must be an expression const int ret = pars_parse_expr(self, &arg->expr, PARS_EXPR_FLAG_ALLOW_ID); if (ret != OK) { return ret; } return pars_parse_arg_after_prefix_expr(self, arg); } else if (token0.type == TT_LPAREN) { // It is either expression or addressing mode (An) / (An)+ / (d16,An) / // (d8,An,Xn) / (d8,PC,Xn) / (d16,PC) pars_commit(self); return pars_parse_arg_inside_parens(self, arg); } else if (token0.type == TT_ID) { // It is either expression, regmask or just An/Dn/PC/SR/SP/CCR register struct token_recognition r = pars_recognize_token(self, token0); if (r.type == RTT_REG) { // This is definitely a register or regmask. switch (r.reg) { case REG_NONE: UNREACHABLE(); return pars_yield_error_msg(self, first_token_id, E_UNREACH); case REG_DN: if (token_is_regmask_delimiter(pars_peek_more(self, 1).type)) { // Note: the register is not committed return pars_parse_arg_regmask(self, arg); } arg->type = ARG_DN; arg->xn = r.reg_num; break; case REG_AN: if (token_is_regmask_delimiter(pars_peek_more(self, 1).type)) { // Note: the register is not committed return pars_parse_arg_regmask(self, arg); } arg->type = ARG_AN; arg->xn = r.reg_num; break; case REG_PC: return pars_yield_error(self, first_token_id, E_AN_DN); case REG_SR: arg->type = ARG_SR; break; case REG_CCR: arg->type = ARG_CCR; break; case REG_USP: arg->type = ARG_USP; break; } pars_commit(self); arg->num_tokens = self->cur_tok_id - first_token_id; return OK; } else { const int ret = pars_parse_expr(self, &arg->expr, PARS_EXPR_FLAG_ALLOW_ID); if (ret != OK) { return ret; } return pars_parse_arg_after_prefix_expr(self, arg); } } return OK; } static int pars_yield_instruction( struct pars *const self, const size_t label_id, const size_t comment_id, const size_t mnemonic_id, const enum opsize opsize, const struct arg *const arg1, const struct arg *const arg2) { const struct token mnemonic_token = self->lex->tokbuf[mnemonic_id]; const enum mnemonic mnemonic = get_mnemonic_from_identifier( self->lex->input + mnemonic_token.offset, mnemonic_token.length); if (mnemonic == MN_NONE) { return pars_yield_error(self, mnemonic_id, E_MNEMONIC); } if (arg2->type != ARG_NONE) { assert(arg1->type != ARG_NONE); } if (arg1->type == ARG_NONE) { assert(arg2->type == ARG_NONE); } const enum args_count args_count = get_args_count_for_mnemonic(mnemonic); // Validate instruction arguments count switch (args_count) { case ARGS_COUNT_UNKNOWN: UNREACHABLE(); break; case ARGS_COUNT_0: if (arg1->type != ARG_NONE) { return pars_yield_error_msg(self, arg1->first_token, E_ARGS_COUNT); } break; case ARGS_COUNT_1: if (arg1->type == ARG_NONE) { return
pars_yield_error_msg(self, mnemonic_id, E_ARGS_COUNT); } else if (arg2->type != ARG_NONE) { return pars_yield_error_msg(self, arg2->first_token, E_ARGS_COUNT); } break; case ARGS_COUNT_1_2: if (arg1->type == ARG_NONE) { return pars_yield_error_msg(self, mnemonic_id, E_ARGS_COUNT); } break; case ARGS_COUNT_2: if (arg1->type == ARG_NONE || arg2->type == ARG_NONE) { return pars_yield_error_msg(self, mnemonic_id, E_ARGS_COUNT); } break; } const size_t first_token_id = label_id ? label_id : mnemonic_id; struct stmt stmt = { .type = ST_INSTRUCTION, .instruction = { .mnemonic = mnemonic, .opsize = opsize, .arg1 = arg1 ? *arg1 : (struct arg){0}, .arg2 = arg2 ? *arg2 : (struct arg){0}, }, .label_token = label_id, .comment_token = comment_id, .first_token = first_token_id, .num_tokens = self->cur_tok_id - first_token_id, }; pars_put_stmt(self, &stmt); return OK; } static int pars_parse_instruction_args( struct pars *const self, const size_t label_id, const size_t mnemonic_id, const enum opsize opsize) { struct arg arg1 = {0}, arg2 = {0}; // Try parse first argument const int res1 = pars_parse_arg(self, &arg1); if (res1 != OK) { return res1; } if (arg1.type != ARG_NONE) { if (!pars_is_eof_reached(self)) { const struct token token = pars_peek(self); if (token.type == TT_COMMA) { pars_commit(self); // Try parse second argument if (pars_is_eof_reached(self)) { return pars_yield_error_eof(self, E_ARG); } if (pars_peek(self).type == TT_NEWLINE) { return pars_yield_error(self, self->cur_tok_id, E_ARG); } const int res2 = pars_parse_arg(self, &arg2); if (res2 != OK) { return res2; } } else if (token.type != TT_COMMENT_SEMICOLON && token.type != TT_NEWLINE) { return pars_yield_error(self, self->cur_tok_id, E_INSTR_END); } } } // Finish parsing instruction, expect comment or newline size_t comment_id = 0; const int ret = pars_parse_comment_and_newline(self, &comment_id); if (ret != OK) { return ret; } return pars_yield_instruction( self, label_id, comment_id, mnemonic_id, opsize, &arg1, &arg2); } static int pars_parse_instruction( struct pars *const self, const size_t label_id, const size_t mnemonic_id) { if (pars_is_eof_reached(self)) { return pars_yield_error_eof(self, E_MNEMONIC); } const struct token size_spec = pars_peek(self); if (size_spec.type == TT_DOT_ID) { const size_t size_spec_id = pars_commit(self); // Size specifier if (size_spec.length != 2) { return pars_yield_error(self, size_spec_id, E_INSN_SIZE_SPEC); } const enum opsize opsize = get_opsize_from_specifier(self->lex->input[size_spec.offset + 1]); if (opsize == OPSIZE_NONE) { return pars_yield_error(self, size_spec_id, E_INSN_SIZE_SPEC); } return pars_parse_instruction_args(self, label_id, mnemonic_id, opsize); } return pars_parse_instruction_args( self, label_id, mnemonic_id, OPSIZE_NONE); } static int pars_parse_assignment( struct pars *const self, const size_t label_id, const size_t symbol_id) { // TODO (void) label_id; (void) symbol_id; return pars_yield_error_msg(self, self->cur_tok_id, E_NIMPL); } static int pars_yield_label_comment( struct pars *const self, const size_t label_id, const size_t comment_id) { if (label_id || comment_id) { const size_t first_token = label_id ? label_id : comment_id; struct stmt stmt = { .type = label_id ? 
ST_LABEL : ST_COMMENT, .label_token = label_id, .comment_token = comment_id, .first_token = first_token, .num_tokens = self->cur_tok_id - first_token, }; pars_put_stmt(self, &stmt); } return OK; } static int pars_parse_labeled_statement( struct pars *const self, const size_t label_id) { const struct token token1 = pars_peek(self); const bool is_comment = token1.type == TT_COMMENT_ASTERISK || token1.type == TT_COMMENT_SEMICOLON; if (is_comment) { return pars_yield_label_comment(self, label_id, pars_commit(self)); } else if (token1.type == TT_NEWLINE) { pars_commit(self); return pars_yield_label_comment(self, label_id, 0); } else if (token1.type == TT_ID) { const size_t token1_id = pars_commit(self); if (pars_is_eof_reached(self)) { return pars_yield_error_eof(self, E_LABELED_STMT); } const struct token token2 = pars_peek(self); if (!label_id && token2.type == TT_COLON) { pars_commit(self); return pars_parse_labeled_statement(self, token1_id); } else if (token2.type == TT_EQ || token2.type == TT_EQ_DOUBLE) { pars_commit(self); return pars_parse_assignment(self, label_id, token1_id); } return pars_parse_instruction(self, label_id, token1_id); } else if (token1.type == TT_DOT_ID) { return pars_parse_direc(self, label_id); } return pars_yield_error(self, self->cur_tok_id, E_STMT_BEGIN); } static int pars_parse_statement(struct pars *const self) { return pars_parse_labeled_statement(self, 0); } /** Run parser until the end of the input reached * \returns OK if parsing finished successfully * \returns ERR if error encountered and parsing cannot continue. */ static int pars_run(struct pars *const self) { // Skip dummy token at position 0 self->cur_tok_id = 1; // Leave dummy statement at position 0 pars_put_stmt(self, &(struct stmt){0}); int ret = OK; while (self->cur_tok_id < self->lex->tokens_count) { ret = pars_parse_statement(self); if (ret != OK) { // Don't really care about parsing errors right now ret = OK; } } fflush(self->stmttab_stream); fflush(self->symtab_stream); fflush(self->symbuf_stream); return ret; } static void pars_destroy(struct pars *const self) { fclose(self->stmttab_stream); free(self->stmttab); fclose(self->symtab_stream); free(self->symtab); fclose(self->symbuf_stream); free(self->symbuf); } static bool is_bcc(const enum mnemonic mnemonic) { switch (mnemonic) { case MN_BRA: case MN_BSR: case MN_BCC: case MN_BCS: case MN_BEQ: case MN_BGE: case MN_BGT: case MN_BHI: case MN_BLE: case MN_BLS: case MN_BLT: case MN_BMI: case MN_BNE: case MN_BPL: case MN_BVC: case MN_BVS: return true; default: break; } return false; } static int assem_init(struct assem *const self, const struct pars *const pars) { *self = (struct assem){ .pars = pars, }; return OK; } static int assem_resolve(struct assem *const self) { (void) self; return OK; } static void emit_token_id( const struct lex *const lex, const struct token *const token, FILE *const s) { assert(token->type == TT_ID); if (lex->input[token->offset] == '_') { // Strip leading underscore // FIXME It should be sort of an option that may be disabled. fprintf(s, "%.*s", (int)token->length - 1, lex->input + token->offset + 1); } else if (lex->input[token->offset] == 'L') { // Add leading dot to all local symbols (they start with 'L'). // FIXME It should be sort of an option that may be disabled. 
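/* Hypothetical examples of the mapping done by emit_token_id as a whole: "_main" is emitted as "main" (underscore stripped above) and "L42" as ".L42", which GNU AS treats as a local symbol. */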
fprintf(s, ".%.*s", (int)token->length, lex->input + token->offset); } else { fprintf(s, "%.*s", (int)token->length, lex->input + token->offset); } } static void emit_expr( const struct lex *const lex, const struct expr *const expr, FILE *const s) { for (size_t i = 0; i < expr->num_tokens; i++) { const struct token token = lex->tokbuf[expr->first_token + i]; if (token.type == TT_NEWLINE) { break; } else if (token.type == TT_ID) { emit_token_id(lex, &token, s); } else { fprintf(s, "%.*s", (int)token.length, lex->input + token.offset); } } } static void emit_arg( const struct lex *const lex, const struct arg *const arg, FILE *const s) { switch (arg->type) { case ARG_DN: fprintf(s, "%%d%d", arg->xn); break; case ARG_AN: fprintf(s, "%%a%d", arg->xn); break; case ARG_AN_ADDR: fprintf(s, "%%a%d@", arg->xn); break; case ARG_AN_ADDR_INCR: fprintf(s, "%%a%d@+", arg->xn); break; case ARG_AN_ADDR_DECR: fprintf(s, "%%a%d@-", arg->xn); break; case ARG_AN_ADDR_16: fprintf(s, "%%a%d@(", arg->xn); emit_expr(lex, &arg->expr, s); fprintf(s, ")"); break; case ARG_AN_ADDR_8_XI: fprintf(s, "%%a%d@(", arg->xn); emit_expr(lex, &arg->expr, s); { const char reg_type = arg->xi & 0x8 ? 'a' : 'd'; const char size = arg->briefext_size == OPSIZE_L ? 'l' : 'w'; fprintf(s, ",%%%c%d:%c)", reg_type, arg->xi & 0x7, size); } break; case ARG_ADDR_WORD: emit_expr(lex, &arg->expr, s), fprintf(s, ":w"); break; case ARG_ADDR_LONG: emit_expr(lex, &arg->expr, s), fprintf(s, ":l"); break; case ARG_ADDR_UNSPEC: emit_expr(lex, &arg->expr, s); break; case ARG_PC_ADDR_16: fprintf(s, "%%pc@("); emit_expr(lex, &arg->expr, s); fprintf(s, ")"); break; case ARG_PC_ADDR_8_XI: fprintf(s, "%%pc@("); emit_expr(lex, &arg->expr, s); { const char reg_type = arg->xi & 0x8 ? 'a' : 'd'; const char size = arg->briefext_size == OPSIZE_L ? 'l' : 'w'; fprintf(s, ",%%%c%d:%c)", reg_type, arg->xi & 0x7, size); } break; case ARG_IMMEDIATE: fprintf(s, "#"), emit_expr(lex, &arg->expr, s); break; case ARG_REGMASK: { // TODO make it concise bool leading_space = false; for (unsigned i = 0; i < 8; i++) { if (arg->regmask & (1 << i)) { fprintf(s, "%s%%d%d", leading_space ? "/" : "", i); leading_space = true; } } for (unsigned i = 0; i < 8; i++) { if (arg->regmask & (1 << (i + 8))) { fprintf(s, "%s%%a%d", leading_space ? 
"/" : "", i); leading_space = true; } } } break; case ARG_SR: fprintf(s, "%%sr"); break; case ARG_CCR: fprintf(s, "%%ccr"); break; case ARG_USP: fprintf(s, "%%usp"); break; case ARG_NONE: break; } } static void emit_directive_same( const struct lex *const lex, const struct directive *const dir, FILE *const s) { const struct token name_token = lex->tokbuf[dir->name_token]; fprintf(s, "\t%.*s", (int)name_token.length, lex->input + name_token.offset); if (dir->num_tokens) { fprintf(s, "\t"); } for (size_t i = 0; i < dir->num_tokens; i++) { const struct token token = lex->tokbuf[dir->first_token + i]; if (i != 0) { fprintf(s, " "); } if (token.type == TT_ID) { emit_token_id(lex, &token, s); } else { fprintf(s, "%.*s", (int)token.length, lex->input + token.offset); } } } static void emit_directive_byte( const struct lex *const lex, const struct directive *const dir, FILE *const s) { if (dir->num_tokens < 1) { // We won't emit this because it is invalid return; } if (lex->tokbuf[dir->first_token].type == TT_STRING) { fprintf(s, "\t.ascii\t"); } else { fprintf(s, "\t.byte\t"); } for (size_t i = 0; i < dir->num_tokens; i++) { const struct token token = lex->tokbuf[dir->first_token + i]; if (i != 0) { fprintf(s, " "); } if (token.type == TT_ID) { emit_token_id(lex, &token, s); fprintf(s, " "); } else { fprintf(s, "%.*s", (int)token.length, lex->input + token.offset); } } } static void emit_directive_short( const struct lex *const lex, const struct directive *const dir, FILE *const s) { if (dir->num_tokens < 1) { // We won't emit this because it is invalid return; } fprintf(s, "\t.short\t"); for (size_t i = 0; i < dir->num_tokens; i++) { const struct token token = lex->tokbuf[dir->first_token + i]; if (i != 0) { fprintf(s, " "); } if (token.type == TT_ID) { emit_token_id(lex, &token, s); } else { fprintf(s, "%.*s", (int)token.length, lex->input + token.offset); } } } static void emit_directive_size_of_function( const struct lex *const lex, const size_t function_name_token_id, FILE *const s) { const struct token name_token = lex->tokbuf[function_name_token_id]; const char *name = lex->input + name_token.offset; int len = name_token.length; if (*name == '_') { // Strip leading underscore // FIXME It should be sort of an option that may be disabled. name++; len--; } fprintf(s, "\t.size\t%.*s, .-%.*s", len, name, len, name); } static void emit_directive_type( const struct lex *const lex, const size_t function_name_token_id, const char *const type_str, FILE *const s) { const struct token name_token = lex->tokbuf[function_name_token_id]; const char *name = lex->input + name_token.offset; int len = name_token.length; if (*name == '_') { // Strip leading underscore // FIXME It should be sort of an option that may be disabled. 
name++; len--; } fprintf(s, "\t.type\t%.*s, @%s", len, name, type_str); } static enum opsize assem_resolve_bcc( struct assem *const self, const size_t stmt_number) { const struct pars *const pars = self->pars; const struct stmt *const stmt = pars->stmttab + stmt_number; const struct instruction *const instr = &stmt->instruction; const struct arg *const arg = &instr->arg1; // XXX I'm not sure what else it can be, so let's catch it with an assert assert(arg->type == ARG_ADDR_UNSPEC); // Usually it is just a label - a single token // TODO Impl support of expressions for real assert(arg->expr.num_tokens); const struct token *const target_token = pars->lex->tokbuf + arg->expr.first_token; assert(target_token->type == TT_DOT_ID || target_token->type == TT_ID); bool found = false; uint32_t found_addr = 0; /* symtab_size is in bytes, like the other table buffer sizes */ for (size_t i = 0; i < pars->symtab_size / (sizeof *pars->symtab); i++) { const struct sym *const sym = pars->symtab + i; if (sym->addr >= stmt->addr) { // Current Bcc statement address reached or surpassed. And we are // not interested in it if it is located either somewhere after // current Bcc statement or in another translation unit. break; } const struct stmt *const target = pars->stmttab + sym->stmt_id; const struct token *const label_token = pars->lex->tokbuf + target->label_token; const bool matches = label_token->length == target_token->length && 0 == memcmp( pars->lex->input + label_token->offset, pars->lex->input + target_token->offset, target_token->length); if (matches) { found = true; found_addr = sym->addr; } } if (found && ((stmt->addr + 2 - found_addr) <= BCC_S_MAX_BACKWARDS)) { return OPSIZE_S; } // Label is not found, therefore current Bcc size must be as wide as // possible (word). This is original behavior of Sierra's ASM68.EXE. return OPSIZE_W; } static struct res { uint32_t value; bool ok; } assem_find_symbol( const struct assem *const self, const size_t stmt_id) { for (size_t i = 0; i < self->pars->symtab_size / (sizeof *self->pars->symtab); i++) { if ((self->pars->symtab + i)->stmt_id == stmt_id) { return (struct res){ .value = (self->pars->symtab + i)->addr, .ok = true, }; } } return (struct res){ .ok = false }; } static void assem_emit_meta_sat( struct assem *const self, const struct sat *const sat, FILE *const stream) { const struct lex *const lex = self->pars->lex; if (0 == sat->val_arg.first_token) { return; } if (sat->scl_arg.value == C_EFCN) { emit_directive_size_of_function( lex, sat->def_arg.first_token, stream); fprintf(stream, "\n\n"); } // Check if first derived type (2 bits at offset 4) is function if (((sat->type_arg.value >> 4) & 3) == DT_FCN) { emit_directive_type( lex, sat->def_arg.first_token, "function", stream); fprintf(stream, "\n"); } } static int assem_emit(struct assem *const self, FILE *const stream) { const struct lex *const lex = self->pars->lex; const struct pars *const pars = self->pars; if (TRACE_LEXER) { for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) { fprint_token_debug(lex->input, &lex->tokbuf[i], stream); } } if (TRACE_PARSER) { for (size_t i = 1; i < pars->stmttab_size / (sizeof *pars->stmttab); i++) { fprint_stmt_debug(lex, pars->stmttab + i, stream); } } for (size_t i = 1; i < pars->stmttab_size / (sizeof *pars->stmttab); i++) { bool line_is_empty = true; const struct stmt *stmt = pars->stmttab + i; if (stmt->label_token) { const struct token token = lex->tokbuf[stmt->label_token]; if (token.type == TT_ID) { emit_token_id(lex, &token, stream); fprintf(stream, ":"); } else { fprintf(stream, "%.*s:", (int)token.length, lex->input + token.offset); } line_is_empty = false; } if (stmt->type
== ST_INSTRUCTION) { const struct instruction instr = stmt->instruction; fprintf(stream, "\t%s", mnemonic_to_string(instr.mnemonic)); if (instr.opsize != OPSIZE_NONE) { fprintf(stream, "%c", opsize_to_char(instr.opsize)); } else if (is_bcc(instr.mnemonic)) { enum opsize opsize = assem_resolve_bcc(self, i); fprintf(stream, "%c", opsize_to_char(opsize)); } if (instr.arg1.type != ARG_NONE) { fprintf(stream, " "); emit_arg(lex, &instr.arg1, stream); if (instr.mnemonic == MN_JMP || instr.mnemonic == MN_JSR) { if (instr.arg1.type == ARG_ADDR_UNSPEC) { fprintf(stream, ":l"); } } if (instr.arg2.type != ARG_NONE) { fprintf(stream, ","); emit_arg(lex, &instr.arg2, stream); } } line_is_empty = false; } else if (stmt->type == ST_DIRECTIVE) { const struct directive *dir = &stmt->directive; switch (dir->type) { case DT_ALIGN: case DT_FILE: case DT_GLOBL: case DT_TEXT: emit_directive_same(lex, dir, stream); line_is_empty = false; break; case DT_ASCII: case DT_BYTE: emit_directive_byte(lex, dir, stream); line_is_empty = false; break; case DT_SHORT: case DT_WORD: emit_directive_short(lex, dir, stream); line_is_empty = false; break; default: break; } } if (stmt->comment_token) { const struct token token = lex->tokbuf[stmt->comment_token]; fprintf(stream, " |%.*s", (int)token.length - 1, lex->input + token.offset + 1); } if (stmt->label_token) { struct res res = assem_find_symbol(self, i); if (res.ok) { fprintf(stream, " | @%08x", res.value); } } if (!line_is_empty) { fprintf(stream, "\n"); } if (stmt->type == ST_META_SAT) { assem_emit_meta_sat(self, &stmt->sat, stream); } } return OK; } static void assem_destroy(struct assem *const self) { (void) self; } int main(const int argc, char *const argv[]) { // No fucks given about arguments for now (void)argc; (void)argv; struct lex lex; struct pars pars; if (OK != lex_init(&lex)) { return EXIT_FAILURE; } // Tokenize assembly program text if (OK != lex_run(&lex, stdin)) { lex_destroy(&lex); return EXIT_FAILURE; } // Parser needs final lexer state to access parsed tokens and input data if (OK != pars_init(&pars, &lex)) { lex_destroy(&lex); return EXIT_FAILURE; } // Parse assembly program text if (OK != pars_run(&pars)) { pars_destroy(&pars); lex_destroy(&lex); return EXIT_FAILURE; } struct assem assem; // Allocate and populate code table and metadata table from parsed data. // Assembler needs parser's and lexer's final state to access parsed // structure, tokens and input. if (OK != assem_init(&assem, &pars)) { pars_destroy(&pars); lex_destroy(&lex); return EXIT_FAILURE; } // Resolve all ambiguities if (OK != assem_resolve(&assem)) { assem_destroy(&assem); pars_destroy(&pars); lex_destroy(&lex); return EXIT_FAILURE; } // Emit unambiguous assembly language program text for specified dialect // (currently m68k GNU AS only is supported) if (OK != assem_emit(&assem, stdout)) { assem_destroy(&assem); pars_destroy(&pars); lex_destroy(&lex); return EXIT_FAILURE; } assem_destroy(&assem); pars_destroy(&pars); lex_destroy(&lex); }
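/* Example invocation (illustrative; the program is a plain stdin -> stdout filter and the binary name below is only an assumption): ./sierra2gas < GAME.ASM > game.s Operands are rewritten into GNU AS (MIT) syntax by emit_arg(), e.g. a postincrement operand on A0 (ARG_AN_ADDR_INCR) is emitted as "%a0@+" and an immediate "#10" is passed through as "#10". */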