/* SPDX-License-Identifier: Unlicense
 *
 * This program translates Sierra m68k assembly dialect to GNU AS m68k dialect.
 *
 * NOTE: Unicode is not supported, ASCII only.
 */

// open_memstream() is a POSIX.1-2008 function, so the feature test macro has
// to be defined before the first system header is included.
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef TRACE_LEX
#define TRACE_LEX 1
#endif

// Status codes shared by the lexer, the parser and the assembler stages.
#define ERR 0
#define OK 1
#define CONTINUE 2

enum token_type {
    TT_NONE = 0,
    TT_NEWLINE,
    TT_ESCAPE,
    TT_DOT,
    TT_COMMA,
    TT_PLUS,
    TT_MINUS,
    TT_ASTERISK,
    TT_SLASH,
    TT_EQ,
    TT_EQ_DOUBLE,
    TT_COLON,
    TT_PERCENT,
    TT_LEFT_SHIFT,
    TT_RIGHT_SHIFT,
    TT_HASH,
    TT_BANG,
    TT_DOLLAR,
    TT_TILDE,
    TT_AMPERSAND,
    TT_PIPE,
    TT_CAP,
    TT_STRING,
    TT_IDENTIFIER,
    TT_NUMDEC,
    TT_NUMOCT,
    TT_NUMHEX,
    TT_LPAREN,
    TT_RPAREN,
    TT_LBRACKET,
    TT_RBRACKET,
    TT_RBRACE,
    TT_LBRACE,
    TT_COMMENT_ASTERISK,
    TT_COMMENT_SEMICOLON,
};

/** A token is a typed span of bytes inside the lexer's input buffer. */
struct token {
    enum token_type type;
    size_t offset; // Byte offset of the first character in `lex.input`
    size_t length; // Number of bytes the token covers
};

enum lex_error {
    LE_NONE = 0,
    LE_SOME,
};

enum lex_state {
    LS_FREE = 0,
    LS_CR,
    LS_LEFT_SHIFT,
    LS_RIGHT_SHIFT,
    LS_EQ,
    LS_IDENTIFIER,
    LS_NUMOCTHEX,
    LS_NUMOCT,
    LS_NUMHEX,
    LS_NUMDEC,
    LS_STRING,
    LS_STRING_ESC,
    LS_COMMENT_ASTERISK,
    LS_COMMENT_SEMICOLON,
    LS_ERROR,
    LS_EOF,
};

/** Zero based position of a character inside the input buffer. */
struct line_pos_info {
    unsigned long line_num;
    unsigned long column_num;
    unsigned long line_offset; // Byte offset of the first character of the line
};

struct lex {
    // State variables
    enum lex_state state;
    enum lex_error error;
    size_t cursor;     // Offset of the character that is being handled
    size_t tok_offset; // Offset where the multi-character token started
    bool inside_line;  // True after any token other than NEWLINE or ESCAPE
    // Stream the characters are currently read from, set by lex_next()
    FILE *source;
    // Input data buffer
    FILE *input_stream;
    char *input;
    size_t input_size;
    // Tokens table
    FILE *tokbuf_stream;
    struct token *tokbuf;
    size_t tokbuf_size; // Size in bytes, not in tokens
};

enum stmt_kind {
    SK_NONE = 0,
    SK_LABEL,
    SK_INSTRUCTION,
    SK_TEXT,
    SK_DIR_FILE,
    SK_DIR_TEXT,
    SK_DIR_ALIGN,
    SK_DIR_DEF_ENDEF,
    SK_DIR_GLOBL,
    SK_DIR_LINE,
};

enum mnemonic {
    OPCODE_NONE,
    OPCODE_NOP,
};

enum opsize {
    OPSIZE_NONE = 0,
    OPSIZE_S,
    OPSIZE_B,
    OPSIZE_W,
    OPSIZE_L,
};

enum arg_kind {
    ARG_NONE = 0,
    ARG_DN,
    ARG_AN,
    ARG_AN_ADDR,
    ARG_AN_ADDR_INCR,
    ARG_AN_ADDR_DECR,
    ARG_AN_ADDR_16,
    ARG_AN_ADDR_8_XN,
    ARG_ADDR_WORD,
    ARG_ADDR_LONG,
    ARG_ADDR_UNSPEC,
    ARG_PC_ADDR_16,
    ARG_PC_ADDR_8_XN,
    ARG_IMMEDIATE,
};

struct arg_8_xn {
    int8_t val;
    int8_t an;
    int8_t xi;
};

struct instruction {
    enum mnemonic mnemonic;
    enum opsize opsize;
    enum arg_kind arg1_kind, arg2_kind;
    union {
        int32_t imm, addr;
        struct arg_8_xn arg_8_xn; // For (d,An,Xi) and (d,PC,Xn)
    } arg1, arg2;
};

struct def_endef {
    size_t sym_id;
    size_t tag_sym_id;
    int32_t size;
    int32_t storage_class;
    int32_t type;
};

struct stmt {
    enum stmt_kind type;
    union {
        struct instruction instruction;
        int32_t align;
        size_t globl_sym_id;
        size_t file_sym_id;
    };
    size_t first_token, num_tokens; // Statement tokens span, may be empty
    size_t comment_token;
};

struct symbol {
    size_t offset; // Byte offset in continuous null terminated symbol buffer
    // Instead of strcmp every item in symtab we can compare hashes and get O(N)
    // for search.
    uint32_t hash;
};

enum pars_error {
    PE_NONE = 0,
    PE_LEX,
    PE_SOME,
};

struct pars {
    const struct lex *lex;
    // State
    size_t cur_tok_id;
    enum pars_error error;
    // Statement table
    FILE *stmttab_stream;
    struct stmt *stmttab;
    size_t stmttab_size;
    // Symbol table (not populated yet, its stream is never opened)
    FILE *symtab_stream;
    struct symbol *symtab;
    size_t symtab_size;
    // Symbol buffer for symbol table
    FILE *symbuf_stream;
    char *symbuf;
    size_t symbuf_size;
};

struct assem {
    const struct pars *pars;
};

/** Printable representation of every possible byte value.
 *
 * Indexed by the byte converted to `unsigned char`. All 256 entries must be
 * present because the table is handed to fputs() for arbitrary input bytes.
 */
const char *const g_escape_table[256] = {
    "\\x00", "\\x01", "\\x02", "\\x03", "\\x04", "\\x05", "\\x06", "\\x07",
    "\\x08", "\\t", "\\n", "\\x0b", "\\x0c", "\\r", "\\x0e", "\\x0f",
    "\\x10", "\\x11", "\\x12", "\\x13", "\\x14", "\\x15", "\\x16", "\\x17",
    "\\x18", "\\x19", "\\x1a", "\\x1b", "\\x1c", "\\x1d", "\\x1e", "\\x1f",
    " ", "!", "\\\"", "#", "$", "%", "&", "'",
    "(", ")", "*", "+", ",", "-", ".", "/",
    "0", "1", "2", "3", "4", "5", "6", "7",
    "8", "9", ":", ";", "\\<", "=", "\\>", "?",
    "@", "A", "B", "C", "D", "E", "F", "G",
    "H", "I", "J", "K", "L", "M", "N", "O",
    "P", "Q", "R", "S", "T", "U", "V", "W",
    "X", "Y", "Z", "[", "\\\\", "]", "^", "_",
    "`", "a", "b", "c", "d", "e", "f", "g",
    "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w",
    "x", "y", "z", "{", "|", "}", "~", "\\x7f",
    "\\x80", "\\x81", "\\x82", "\\x83", "\\x84", "\\x85", "\\x86", "\\x87",
    "\\x88", "\\x89", "\\x8a", "\\x8b", "\\x8c", "\\x8d", "\\x8e", "\\x8f",
    "\\x90", "\\x91", "\\x92", "\\x93", "\\x94", "\\x95", "\\x96", "\\x97",
    "\\x98", "\\x99", "\\x9a", "\\x9b", "\\x9c", "\\x9d", "\\x9e", "\\x9f",
    "\\xa0", "\\xa1", "\\xa2", "\\xa3", "\\xa4", "\\xa5", "\\xa6", "\\xa7",
    "\\xa8", "\\xa9", "\\xaa", "\\xab", "\\xac", "\\xad", "\\xae", "\\xaf",
    "\\xb0", "\\xb1", "\\xb2", "\\xb3", "\\xb4", "\\xb5", "\\xb6", "\\xb7",
    "\\xb8", "\\xb9", "\\xba", "\\xbb", "\\xbc", "\\xbd", "\\xbe", "\\xbf",
    "\\xc0", "\\xc1", "\\xc2", "\\xc3", "\\xc4", "\\xc5", "\\xc6", "\\xc7",
    "\\xc8", "\\xc9", "\\xca", "\\xcb", "\\xcc", "\\xcd", "\\xce", "\\xcf",
    "\\xd0", "\\xd1", "\\xd2", "\\xd3", "\\xd4", "\\xd5", "\\xd6", "\\xd7",
    "\\xd8", "\\xd9", "\\xda", "\\xdb", "\\xdc", "\\xdd", "\\xde", "\\xdf",
    "\\xe0", "\\xe1", "\\xe2", "\\xe3", "\\xe4", "\\xe5", "\\xe6", "\\xe7",
    "\\xe8", "\\xe9", "\\xea", "\\xeb", "\\xec", "\\xed", "\\xee", "\\xef",
    "\\xf0", "\\xf1", "\\xf2", "\\xf3", "\\xf4", "\\xf5", "\\xf6", "\\xf7",
    "\\xf8", "\\xf9", "\\xfa", "\\xfb", "\\xfc", "\\xfd", "\\xfe", "\\xff",
};

/** Tell whether a character has to be printed through `g_escape_table`. */
static bool should_be_escaped(const int c)
{
    return c < ' ' || c == '"' || c == '\\' || c == '<' || c == '>' || c > '~';
}

static bool is_oct(const int c)
{
    return c >= '0' && c <= '7';
}

static bool is_dec(const int c)
{
    return c >= '0' && c <= '9';
}

static bool is_hex(const int c)
{
    return is_dec(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
}

static bool is_alphabetic(const int c)
{
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

static bool is_alphanum(const int c)
{
    return is_dec(c) || is_alphabetic(c);
}

/** Number of bytes fprint_string_escaped() emits for the character `c`.
 *
 * The size is taken from the escape table itself, so both always agree.
 */
static int printed_size(const char c)
{
    const unsigned char index = (unsigned char)c;
    if (should_be_escaped(index)) {
        return (int)strlen(g_escape_table[index]);
    }
    return 1;
}

/** Print `length` bytes of `str` into `stream`, escaping special characters.
 * \returns number of bytes written on success.
 * \returns -1 if writing into `stream` failed.
 */
static int fprint_string_escaped(
        const char *const str, const size_t length, FILE *const stream)
{
    int written = 0;
    for (size_t i = 0; i < length; i++) {
        const unsigned char c = (unsigned char)str[i];
        if (should_be_escaped(c)) {
            if (fputs(g_escape_table[c], stream) == EOF) {
                return -1;
            }
        } else if (fputc(c, stream) == EOF) {
            return -1;
        }
        // Account for the character that has just been printed
        written += printed_size((char)c);
    }
    return written;
}

static const char *tok_kind_to_string(const enum token_type type)
{
    switch (type) {
    case TT_NONE: return "NONE";
    case TT_NEWLINE: return "NEWLINE";
    case TT_ESCAPE: return "ESCAPE";
    case TT_DOT: return "DOT";
    case TT_COMMA: return "COMMA";
    case TT_PLUS: return "PLUS";
    case TT_MINUS: return "MINUS";
    case TT_ASTERISK: return "ASTERISK";
    case TT_SLASH: return "SLASH";
    case TT_EQ: return "EQ";
    case TT_EQ_DOUBLE: return "EQ_DOUBLE";
    case TT_COLON: return "COLON";
    case TT_PERCENT: return "PERCENT";
    case TT_LEFT_SHIFT: return "LEFT_SHIFT";
    case TT_RIGHT_SHIFT: return "RIGHT_SHIFT";
    case TT_HASH: return "HASH";
    case TT_BANG: return "BANG";
    case TT_DOLLAR: return "DOLLAR";
    case TT_TILDE: return "TILDE";
    case TT_AMPERSAND: return "AMPERSAND";
    case TT_PIPE: return "PIPE";
    case TT_CAP: return "CAP";
    case TT_STRING: return "STRING";
    case TT_IDENTIFIER: return "IDENTIFIER";
    case TT_NUMDEC: return "NUMDEC";
    case TT_NUMOCT: return "NUMOCT";
    case TT_NUMHEX: return "NUMHEX";
    case TT_LPAREN: return "LPAREN";
    case TT_RPAREN: return "RPAREN";
    case TT_LBRACKET: return "LBRACKET";
    case TT_RBRACKET: return "RBRACKET";
    case TT_LBRACE: return "LBRACE";
    case TT_RBRACE: return "RBRACE";
    case TT_COMMENT_ASTERISK: return "COMMENT";
    case TT_COMMENT_SEMICOLON: return "COMMENT";
    }
    assert(0);
    return "UNKNOWN";
}

/** Print a token as `KIND<escaped text>` followed by a newline.
 * \returns number of bytes written on success.
 * \returns -1 if writing into `stream` failed.
 */
static int fprint_tok(
        const char *const input,
        const struct token *const token,
        FILE *const stream)
{
    static const char trailer[] = ">\n";
    int res = fprintf(stream, "%s<", tok_kind_to_string(token->type));
    if (res < 0) {
        return -1;
    }
    int written = res;
    res = fprint_string_escaped(input + token->offset, token->length, stream);
    if (res < 0) {
        return -1;
    }
    written += res;
    // fputs() reports only success or failure, not the number of bytes
    if (fputs(trailer, stream) == EOF) {
        return -1;
    }
    written += (int)(sizeof trailer - 1);
    return written;
}

/** Reset the lexer and open its growable input and token buffers.
 * \returns OK on success.
 * \returns ERR if a memory stream cannot be opened, nothing is left open then.
 */
static int lex_init(struct lex *const self)
{
    // Zero everything first, so that the streams are opened on fields which
    // are not overwritten afterwards.
    *self = (struct lex){0};
    self->input_stream = open_memstream(&self->input, &self->input_size);
    if (self->input_stream == NULL) {
        perror("open_memstream");
        return ERR;
    }
    self->tokbuf_stream = open_memstream(
            (char **)&self->tokbuf, &self->tokbuf_size);
    if (self->tokbuf_stream == NULL) {
        perror("open_memstream");
        fclose(self->input_stream);
        free(self->input);
        *self = (struct lex){0};
        return ERR;
    }
    return OK;
}

/** Append the raw bytes of a token to the token table stream. */
static int fwrite_token(const struct token *const token, FILE *const stream)
{
    const size_t res = fwrite(token, sizeof *token, 1, stream);
    assert(res == 1);
    return (int)res;
}

/** Store a token and remember whether the current line has any content. */
static void lex_yield_token(struct lex *const self, const struct token *const token)
{
    self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE);
    fwrite_token(token, self->tokbuf_stream);
}

/** Describe what the lexer expected to find in the given state.
 *
 * Only states that are able to report an error have a description.
 */
static const char *lex_state_error_string(
        const enum lex_state state, const bool inside_line)
{
    // An asterisk starts a comment only at the beginning of a line. Other
    // states may fail at the beginning of a line too (e.g. a malformed
    // number), so the state has to be checked here instead of asserted.
    if (!inside_line && state == LS_FREE) {
        return "'*', ';', '0', '[1-9]', '[a-zA-Z_]', ',', '.', '(', ')', '+', "
            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
            "or EOF";
    }
    switch (state) {
    case LS_FREE:
        return "';', '0', '[1-9]', '[a-zA-Z_]', ',', '.', '(', ')', '+', "
            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
            "or EOF";
    case LS_NUMOCTHEX:
        return "';', '[0-7]', [xX], ',', '.', '(', ')', '+', "
            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
            "or EOF";
    case LS_NUMOCT:
        return "';', '[0-7]' , ',', '.', '(', ')', '+', "
            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
            "or EOF";
    case LS_NUMHEX:
        return "';', '[0-9a-fA-F]' , ',', '.', '(', ')', '+', "
            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
            "or EOF";
    case LS_NUMDEC:
        return "';', '[0-9]' , ',', '.', '(', ')', '+', "
            "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
            "or EOF";
    case LS_LEFT_SHIFT:
        return "'<'";
    case LS_RIGHT_SHIFT:
        return "'>'";
    case LS_STRING:
        return "'\"'";
    case LS_STRING_ESC:
        return "any character";
    case LS_CR:
    case LS_EQ:
    case LS_IDENTIFIER:
    case LS_COMMENT_ASTERISK:
    case LS_COMMENT_SEMICOLON:
    case LS_ERROR:
    case LS_EOF:
        assert(0);
        break;
    }
    return "???";
}

/** Find line and column of the last character stored in the input buffer.
 *
 * The input stream must be flushed by the caller. CR, LF and CRLF are all
 * counted as a single line break.
 */
static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
{
    struct line_pos_info l = {0, 0, 0};
    bool cr = false;
    // The last stored character is the one the position is computed for, so
    // only the characters in front of it are walked through.
    const size_t end = (self->input_size > 0) ? self->input_size - 1 : 0;
    for (size_t i = 0; i < end; i++) {
        const char c = self->input[i];
        if (c == '\r') {
            cr = true;
            l.line_offset = i + 1;
            l.line_num++;
            l.column_num = 0;
        } else if (c == '\n') {
            if (!cr) {
                l.line_num++;
            }
            cr = false;
            l.line_offset = i + 1;
            l.column_num = 0;
        } else {
            cr = false;
            l.column_num++;
        }
    }
    return l;
}

/** Report unexpected character `c` to stderr and put the lexer into LS_ERROR.
 *
 * The rest of the offending line is pulled from the source stream, so the
 * whole line can be shown with a caret under the offending character.
 *
 * \returns ERR always.
 */
static int lex_yield_error(struct lex *const self, const int c)
{
    fflush(self->input_stream);
    const struct line_pos_info l = lex_get_line_pos_info(self);
    const size_t cursor = self->cursor;
    // Read out the rest of the line, unless the offending character has
    // already finished it.
    if (c != EOF && c != '\n' && c != '\r') {
        FILE *const source = (self->source != NULL) ? self->source : stdin;
        int next;
        do {
            next = getc(source);
            const char next_char = (next == EOF) ? 0 : (char)next;
            fwrite(&next_char, sizeof next_char, 1, self->input_stream);
        } while (next != EOF && next != '\n' && next != '\r');
    }
    fflush(self->input_stream);
    const unsigned char c_char = (c == EOF) ? 0 : (unsigned char)c;
    // NOTE(review): the location prefix starts with a bare colon, it looks
    // like an input file name is supposed to precede it - confirm.
    fprintf(
            stderr,
            ":%lu:%lu: lexing error: expected %s, found '",
            l.line_num + 1,
            l.column_num + 1,
            lex_state_error_string(self->state, self->inside_line));
    fputs(g_escape_table[c_char], stderr);
    fputs("'\n", stderr);
    // Show the line without its line break, the format string provides one
    const char *const line = self->input + l.line_offset;
    const int line_length = (int)strcspn(line, "\r\n");
    fprintf(stderr, "%5lu | %.*s\n", l.line_num + 1, line_length, line);
    // Same width as the "%5lu | " prefix above to keep the caret aligned
    fputs("      | ", stderr);
    for (size_t i = 0; i < l.column_num; i++) {
        // Repeat tabs, so the caret lands under the right character
        if (self->input[l.line_offset + i] == '\t') {
            fputc('\t', stderr);
        } else {
            fputc(' ', stderr);
        }
    }
    fputs("^\n", stderr);
    fprintf(stderr, ": %zu bytes parsed\n", cursor);
    self->error = LE_SOME;
    self->state = LS_ERROR;
    return ERR;
}

static int lex_handle_next(struct lex *self, int c);

/** Map a character to the token it forms on its own, TT_NONE if it does not. */
static enum token_type lex_single_char_token_type(const int c)
{
    switch (c) {
    case ',': return TT_COMMA;
    case '.': return TT_DOT;
    case '(': return TT_LPAREN;
    case ')': return TT_RPAREN;
    case '[': return TT_LBRACKET;
    case ']': return TT_RBRACKET;
    case '{': return TT_LBRACE;
    case '}': return TT_RBRACE;
    case '+': return TT_PLUS;
    case '-': return TT_MINUS;
    case '/': return TT_SLASH;
    case ':': return TT_COLON;
    case '%': return TT_PERCENT;
    case '#': return TT_HASH;
    case '!': return TT_BANG;
    case '$': return TT_DOLLAR;
    case '~': return TT_TILDE;
    case '&': return TT_AMPERSAND;
    case '|': return TT_PIPE;
    case '^': return TT_CAP;
    case '\n': return TT_NEWLINE;
    case '\\': return TT_ESCAPE;
    default: return TT_NONE;
    }
}

/** Remember the current character as a token start and switch the state. */
static void lex_begin_token(struct lex *const self, const enum lex_state state)
{
    self->tok_offset = self->cursor;
    self->state = state;
}

/** Yield a token that starts at `tok_offset` and go back to LS_FREE. */
static void lex_emit_token(
        struct lex *const self, const enum token_type type, const size_t length)
{
    const struct token token = {type, self->tok_offset, length};
    lex_yield_token(self, &token);
    self->state = LS_FREE;
}

/** Handle a character in LS_FREE state, i.e. in between of tokens. */
static int lex_handle_free(struct lex *const self, const int c)
{
    const enum token_type single = lex_single_char_token_type(c);
    if (single != TT_NONE) {
        lex_yield_token(self, &(struct token){single, self->cursor, 1});
    } else if (is_alphabetic(c) || c == '_') {
        lex_begin_token(self, LS_IDENTIFIER);
    } else if (c == '0') {
        lex_begin_token(self, LS_NUMOCTHEX);
    } else if (is_dec(c)) {
        lex_begin_token(self, LS_NUMDEC);
    } else if (c == '@') {
        lex_begin_token(self, LS_NUMOCT);
    } else if (c == '"') {
        lex_begin_token(self, LS_STRING);
    } else if (c == ';') {
        lex_begin_token(self, LS_COMMENT_SEMICOLON);
    } else if (c == '<') {
        lex_begin_token(self, LS_LEFT_SHIFT);
    } else if (c == '>') {
        lex_begin_token(self, LS_RIGHT_SHIFT);
    } else if (c == '*') {
        // An asterisk is an operator inside of a line and a comment marker
        // at the beginning of a line.
        if (self->inside_line) {
            lex_yield_token(self, &(struct token){TT_ASTERISK, self->cursor, 1});
        } else {
            lex_begin_token(self, LS_COMMENT_ASTERISK);
        }
    } else if (c == '=') {
        lex_begin_token(self, LS_EQ);
    } else if (c == '\r') {
        lex_begin_token(self, LS_CR);
    } else if (c == ' ' || c == '\t') {
        // ignore spaces and tabs
    } else if (c == EOF) {
        self->state = LS_EOF;
    } else if (c == '\x1a') {
        // Ignore "End of file" character
    } else {
        return lex_yield_error(self, c);
    }
    return CONTINUE;
}

/** Feed one character (or EOF) into the lexer state machine.
 *
 * A character that terminates a token without being part of it is handled
 * once more in LS_FREE state after the token has been yielded.
 *
 * \returns CONTINUE if the character has been consumed.
 * \returns ERR if the character is unexpected, the error has been reported.
 */
static int lex_handle_next(struct lex *const self, const int c)
{
    // Length of the token in progress, the current character excluded
    const size_t span = self->cursor - self->tok_offset;
    switch (self->state) {
    case LS_FREE:
        return lex_handle_free(self, c);
    case LS_CR:
        // Accumulate CRLF into single token: 2 for CRLF, 1 for just CR
        lex_emit_token(self, TT_NEWLINE, (c == '\n') ? 2 : 1);
        if (c != '\n') {
            // It is just CR, handle this char in LS_FREE state then
            return lex_handle_next(self, c);
        }
        break;
    case LS_LEFT_SHIFT:
        if (c != '<') {
            return lex_yield_error(self, c);
        }
        // The second '<' is a part of the token
        lex_emit_token(self, TT_LEFT_SHIFT, span + 1);
        break;
    case LS_RIGHT_SHIFT:
        if (c != '>') {
            return lex_yield_error(self, c);
        }
        // The second '>' is a part of the token
        lex_emit_token(self, TT_RIGHT_SHIFT, span + 1);
        break;
    case LS_EQ:
        if (c == '=') {
            lex_emit_token(self, TT_EQ_DOUBLE, 2);
        } else {
            lex_emit_token(self, TT_EQ, 1);
            // It is just single eq "=", handle this char in LS_FREE state then
            return lex_handle_next(self, c);
        }
        break;
    case LS_IDENTIFIER:
        if (!is_alphanum(c) && c != '_') {
            lex_emit_token(self, TT_IDENTIFIER, span);
            return lex_handle_next(self, c);
        }
        break;
    case LS_NUMOCTHEX:
        if (c == 'x' || c == 'X') {
            self->state = LS_NUMHEX;
        } else if (is_oct(c)) {
            self->state = LS_NUMOCT;
        } else if (is_alphabetic(c) || c == '_') {
            return lex_yield_error(self, c);
        } else {
            assert(span == 1);
            // It was just zero, handle this char in LS_FREE state then
            lex_emit_token(self, TT_NUMDEC, 1);
            return lex_handle_next(self, c);
        }
        break;
    case LS_NUMOCT:
        if (is_alphabetic(c) || c == '_') {
            return lex_yield_error(self, c);
        } else if (!is_oct(c)) {
            // This token is finished, handle this char in LS_FREE state
            lex_emit_token(self, TT_NUMOCT, span);
            return lex_handle_next(self, c);
        }
        break;
    case LS_NUMHEX:
        if (is_hex(c)) {
            // Keep accumulating digits
        } else if (is_alphabetic(c) || c == '_') {
            return lex_yield_error(self, c);
        } else {
            // This token is finished, handle this char in LS_FREE state
            lex_emit_token(self, TT_NUMHEX, span);
            return lex_handle_next(self, c);
        }
        break;
    case LS_NUMDEC:
        if (is_alphabetic(c) || c == '_') {
            return lex_yield_error(self, c);
        } else if (!is_dec(c)) {
            // This token is finished, handle this char in LS_FREE state
            lex_emit_token(self, TT_NUMDEC, span);
            return lex_handle_next(self, c);
        }
        break;
    case LS_STRING:
        if (c == EOF) {
            // The closing quote is missing
            return lex_yield_error(self, c);
        } else if (c == '\\') {
            self->state = LS_STRING_ESC;
        } else if (c == '"') {
            // The closing quote is a part of the token
            lex_emit_token(self, TT_STRING, span + 1);
        }
        break;
    case LS_STRING_ESC:
        if (c == EOF) {
            return lex_yield_error(self, c);
        }
        // Whatever follows the backslash belongs to the string
        self->state = LS_STRING;
        break;
    case LS_COMMENT_ASTERISK:
        // A comment on the last line may be finished by EOF
        if (c == '\r' || c == '\n' || c == EOF) {
            lex_emit_token(self, TT_COMMENT_ASTERISK, span);
            return lex_handle_next(self, c);
        }
        break;
    case LS_COMMENT_SEMICOLON:
        // A comment on the last line may be finished by EOF
        if (c == '\r' || c == '\n' || c == EOF) {
            lex_emit_token(self, TT_COMMENT_SEMICOLON, span);
            return lex_handle_next(self, c);
        }
        break;
    case LS_ERROR:
        return ERR;
    case LS_EOF:
        // Nothing may be fed into the lexer after EOF
        assert(0);
        break;
    }
    return CONTINUE;
}

/** Advance lexer to produce new token.
 *
 * Every character read is copied into the lexer's input buffer, EOF is stored
 * as a zero byte.
 *
 * \returns EOF if end of file reached.
 * \returns ERR if error encountered and lexing cannot continue.
 * \returns OK if one or more new tokens have been parsed.
 */
static int lex_next(struct lex *const self, FILE *const stream)
{
    // Error reporting reads the rest of the offending line from here
    self->source = stream;
    for (;; self->cursor++) {
        const int c = fgetc(stream);
        const char c_char = (c == EOF) ? 0 : (char)c;
        fwrite(&c_char, sizeof c_char, 1, self->input_stream);
        const int ret = lex_handle_next(self, c);
        if (OK == ret) {
            return OK;
        } else if (ERR == ret) {
            // TODO handle errors
            return ERR;
        }
        if (c == EOF) {
            break;
        }
    }
    return EOF;
}

/** Run lexer until the end of the input reached
 * \returns OK if lexing finished successfully
 * \returns ERR if error encountered and lexing cannot continue.
 */
static int lex_run(struct lex *const self, FILE *const stream)
{
    for (;;) {
        const int res = lex_next(self, stream);
        if (res == ERR) {
            return ERR;
        }
        if (res == EOF) {
            break;
        }
    }
    // Make `input` and `tokbuf` with their sizes valid for reading
    fflush(self->input_stream);
    fflush(self->tokbuf_stream);
    return OK;
}

/** Close the lexer streams and release the buffers they own. */
static void lex_destroy(struct lex *const self)
{
    fclose(self->input_stream);
    free(self->input);
    fclose(self->tokbuf_stream);
    free(self->tokbuf);
}

/** Reset the parser, bind it to a finished lexer and open its tables.
 * \returns OK on success.
 * \returns ERR if a memory stream cannot be opened, nothing is left open then.
 */
static int pars_init(struct pars *const self, const struct lex *const lex)
{
    // Zero everything first, so that the streams are opened on fields which
    // are not overwritten afterwards.
    *self = (struct pars){ .lex = lex };
    self->stmttab_stream = open_memstream(
            (char **)&self->stmttab, &self->stmttab_size);
    if (self->stmttab_stream == NULL) {
        perror("open_memstream");
        return ERR;
    }
    self->symbuf_stream = open_memstream(&self->symbuf, &self->symbuf_size);
    if (self->symbuf_stream == NULL) {
        perror("open_memstream");
        fclose(self->stmttab_stream);
        free(self->stmttab);
        *self = (struct pars){ .lex = lex };
        return ERR;
    }
    return OK;
}

static int pars_yield_error(struct pars *const self)
{
    (void) self;
    // TODO
    return ERR;
}

/** Number of tokens the lexer has produced. */
static size_t pars_tokens_count(const struct pars *const self)
{
    return self->lex->tokbuf_size / (sizeof *self->lex->tokbuf);
}

/** Take the current token and advance to the next one.
 * \returns a token of TT_NONE type when all the tokens are consumed.
 */
static struct token pars_take_token(struct pars *const self)
{
    if (self->cur_tok_id >= pars_tokens_count(self)) {
        return (struct token){TT_NONE, 0, 0};
    }
    return self->lex->tokbuf[self->cur_tok_id++];
}

/** Compare token's text with a null terminated string.
 *
 * Tokens are spans inside of the input buffer and are not null terminated,
 * hence they cannot be passed to strcmp() directly.
 */
static bool pars_token_is(
        const char *const input, const struct token token, const char *const text)
{
    const size_t length = strlen(text);
    return token.length == length &&
        0 == memcmp(input + token.offset, text, length);
}

static int pars_parse_label(struct pars *const self)
{
    const struct token label = pars_take_token(self);
    if (label.type != TT_IDENTIFIER) {
        return pars_yield_error(self);
    }
    // TODO
    return OK;
}

static int pars_parse_instr(struct pars *const self, const struct token mnemo)
{
    (void) self;
    (void) mnemo;
    // TODO
    return OK;
}

/** Parse a directive, the leading dot is expected to be consumed already. */
static int pars_parse_direc(struct pars *const self)
{
    const char *input = self->lex->input;
    const struct token direc = pars_take_token(self);
    if (direc.type != TT_IDENTIFIER) {
        return pars_yield_error(self);
    }
    if (pars_token_is(input, direc, "def")) {
    } else if (pars_token_is(input, direc, "opt")) {
    } else if (pars_token_is(input, direc, "file")) {
    } else if (pars_token_is(input, direc, "text")) {
    } else if (pars_token_is(input, direc, "align")) {
    } else if (pars_token_is(input, direc, "globl")) {
    } else if (pars_token_is(input, direc, "ln")) {
    } else if (pars_token_is(input, direc, "long")) {
    } else if (pars_token_is(input, direc, "word")) {
    } else if (pars_token_is(input, direc, "byte")) {
    } else if (pars_token_is(input, direc, "bin")) {
    }
    // TODO
    return OK;
}

/** Parse what goes after an optional label: an instruction or a directive.
 *
 * Not called from anywhere yet.
 */
static int pars_parse_instr_or_direc(struct pars *const self)
{
    const struct token token = pars_take_token(self);
    if (token.type == TT_DOT) {
        return pars_parse_direc(self);
    } else if (token.type == TT_IDENTIFIER) {
        return pars_parse_instr(self, token);
    }
    return pars_yield_error(self);
}

/** Parse single statement starting at the current token. */
static int pars_parse_statement(struct pars *const self)
{
    const struct token token = pars_take_token(self);
    const bool is_comment = token.type == TT_COMMENT_ASTERISK ||
        token.type == TT_COMMENT_SEMICOLON;
    if (token.type == TT_IDENTIFIER) {
        return pars_parse_label(self);
    } else if (is_comment) {
        // A comment lasts until the end of the line, so the only thing that
        // may follow it is a line break.
        if (self->cur_tok_id < pars_tokens_count(self)) {
            const struct token nl = pars_take_token(self);
            if (nl.type != TT_NEWLINE) {
                return pars_yield_error(self);
            }
        }
        return OK;
    } else if (token.type == TT_NEWLINE) {
        return OK;
    }
    return pars_yield_error(self);
}

/** Run parser until the end of the input reached
 * \returns OK if parsing finished successfully
 * \returns ERR if error encountered and parsing cannot continue.
 */
static int pars_run(struct pars *const self)
{
    // Parsing is unfinished and is switched off by this early return, the
    // code below is kept to be compiled and type checked.
    return OK;
    while (self->cur_tok_id < pars_tokens_count(self)) {
        const int ret = pars_parse_statement(self);
        if (ret != OK) {
            return ret;
        }
    }
    return OK;
}

/** Close the parser streams and release the buffers they own. */
static void pars_destroy(struct pars *const self)
{
    fclose(self->stmttab_stream);
    free(self->stmttab);
    fclose(self->symbuf_stream);
    free(self->symbuf);
}

static int assem_init(struct assem *const self, const struct pars *const pars)
{
    *self = (struct assem){
        .pars = pars,
    };
    return OK;
}

static int assem_resolve(struct assem *const self)
{
    (void) self;
    return OK;
}

/** Emit the result into `stream`.
 *
 * Only the lexer trace is emitted for now, when TRACE_LEX is non-zero.
 *
 * \returns OK on success.
 * \returns ERR if writing into `stream` failed.
 */
static int assem_emit(struct assem *const self, FILE *const stream)
{
    if (TRACE_LEX) {
        const struct lex *const lex = self->pars->lex;
        const size_t tokens_count = lex->tokbuf_size / (sizeof *lex->tokbuf);
        for (size_t i = 0; i < tokens_count; i++) {
            if (fprint_tok(lex->input, &lex->tokbuf[i], stream) < 0) {
                return ERR;
            }
        }
    }
    return OK;
}

static void assem_destroy(struct assem *const self)
{
    (void) self;
}

int main(const int argc, char *const argv[])
{
    // Command line arguments are ignored for now
    (void)argc;
    (void)argv;
    struct lex lex;
    struct pars pars;
    struct assem assem;
    int status = EXIT_FAILURE;
    if (OK != lex_init(&lex)) {
        return EXIT_FAILURE;
    }
    // Tokenize assembly program text
    if (OK != lex_run(&lex, stdin)) {
        goto out_lex;
    }
    // Parser needs final lexer state to access parsed tokens and input data
    if (OK != pars_init(&pars, &lex)) {
        goto out_lex;
    }
    // Parse assembly program text
    if (OK != pars_run(&pars)) {
        goto out_pars;
    }
    // Allocate and populate code table and metadata table from parsed data.
    // Assembler needs parser's and lexer's final state to access parsed
    // structure, tokens and input.
    if (OK != assem_init(&assem, &pars)) {
        goto out_pars;
    }
    // Resolve all ambiguities
    if (OK != assem_resolve(&assem)) {
        goto out_assem;
    }
    // Emit unambiguous assembly language program text for specified dialect
    // (currently m68k GNU AS only is supported)
    if (OK != assem_emit(&assem, stdout)) {
        goto out_assem;
    }
    status = EXIT_SUCCESS;
    // Stages are torn down in the reverse order of their initialization
out_assem:
    assem_destroy(&assem);
out_pars:
    pars_destroy(&pars);
out_lex:
    lex_destroy(&lex);
    return status;
}