summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Oxore <oxore@protonmail.com> 2023-06-25 12:51:39 +0300
committer Oxore <oxore@protonmail.com> 2023-06-25 12:52:39 +0300
commit f4666450e21bf4558ace3c93eb474f062a0fda4b (patch)
tree d41122c44b77e0104b23b69dfa3b77fe3f3f7dcc
parent 19812eab123d347435929dd4f40649b467d3b457 (diff)
Impl basics of instruction parsing
-rw-r--r-- Makefile 2
-rw-r--r-- main.c 1024
2 files changed, 918 insertions, 108 deletions
diff --git a/Makefile b/Makefile
index d13a5b3..ef687ba 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
WARNFLAGS = -Wall -Wextra -pedantic -Wlogical-op
INCLUDES = lib
-_FLAGS = -O2 -fsanitize=address
+_FLAGS = -O2 -fsanitize=unreachable -fsanitize=address
_CFLAGS = $(CFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g
_CXXFLAGS = $(CXXFLAGS) $(WARNFLAGS) $(addprefix -I,$(INCLUDES)) $(_FLAGS) -pipe -g
LDSCRIPTS =
diff --git a/main.c b/main.c
index 48c1d69..2bdadc3 100644
--- a/main.c
+++ b/main.c
@@ -6,6 +6,7 @@
*/
#include <assert.h>
+#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
@@ -13,8 +14,18 @@
#include <stdlib.h>
#include <string.h>
-#ifndef TRACE_LEX
-#define TRACE_LEX 1
+#ifndef TRACE_LEXER
+#define TRACE_LEXER 0
+#endif
+
+#ifndef TRACE_PARSER
+#define TRACE_PARSER 1
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define UNREACHABLE __builtin_unreachable
+#else
+#define UNREACHABLE()
#endif
#define ERR 0
@@ -39,7 +50,6 @@ enum token_type {
TT_RIGHT_SHIFT,
TT_HASH,
TT_BANG,
- TT_DOLLAR,
TT_TILDE,
TT_AMPERSAND,
TT_PIPE,
@@ -112,22 +122,132 @@ struct lex {
size_t tokbuf_size;
};
-enum stmt_kind {
- SK_NONE = 0,
- SK_LABEL,
- SK_INSTRUCTION,
- SK_TEXT,
- SK_DIR_FILE,
- SK_DIR_TEXT,
- SK_DIR_ALIGN,
- SK_DIR_DEF_ENDEF,
- SK_DIR_GLOBL,
- SK_DIR_LINE,
+enum stmt_type {
+ ST_NONE = 0,
+ ST_LABEL,
+ ST_INSTRUCTION,
+ ST_ASSIGNMENT,
+ ST_COMMENT,
+ ST_DIRECTIVE,
};
enum mnemonic {
- OPCODE_NONE,
- OPCODE_NOP,
+ MN_NONE = 0,
+ MN_ABCD,
+ MN_ADD,
+ MN_ADDA,
+ MN_ADDI,
+ MN_ADDQ,
+ MN_ADDX,
+ MN_AND,
+ MN_ANDI,
+ MN_ASL,
+ MN_BRA,
+ MN_BSR,
+ MN_BCC,
+ MN_BCS,
+ MN_BEQ,
+ MN_BGE,
+ MN_BGT,
+ MN_BHI,
+ MN_BLE,
+ MN_BLS,
+ MN_BLT,
+ MN_BMT,
+ MN_BNE,
+ MN_BPL,
+ MN_BVC,
+ MN_BVS,
+ MN_BCHG,
+ MN_BCLR,
+ MN_BSET,
+ MN_CHK,
+ MN_CLR,
+ MN_CMP,
+ MN_CMPA,
+ MN_CMPI,
+ MN_CMPM,
+ MN_DBT,
+ MN_DBF,
+ MN_DBCC,
+ MN_DBCS,
+ MN_DBEQ,
+ MN_DBGE,
+ MN_DBGT,
+ MN_DBHI,
+ MN_DBLE,
+ MN_DBLS,
+ MN_DBLT,
+ MN_DBMT,
+ MN_DBNE,
+ MN_DBPL,
+ MN_DBVC,
+ MN_DBVS,
+ MN_DIVU,
+ MN_DIVS,
+ MN_EORI,
+ MN_EXG,
+ MN_EXT,
+ MN_ILLEGAL,
+ MN_JMP,
+ MN_JSR,
+ MN_LEA,
+ MN_LINK,
+ MN_LSL,
+ MN_LSR,
+ MN_MOVE,
+ MN_MOVEA,
+ MN_MOVEM,
+ MN_MOVEP,
+ MN_MOVEQ,
+ MN_MULS,
+ MN_MULU,
+ MN_NBCD,
+ MN_NEG,
+ MN_NEGX,
+ MN_NOP,
+ MN_NOT,
+ MN_OR,
+ MN_ORI,
+ MN_PEA,
+ MN_RESET,
+ MN_ROL,
+ MN_ROR,
+ MN_ROXL,
+ MN_ROXR,
+ MN_RTE,
+ MN_RTR,
+ MN_RTS,
+ MN_SBCD,
+ MN_ST,
+ MN_SF,
+ MN_SCC,
+ MN_SCS,
+ MN_SEQ,
+ MN_SGE,
+ MN_SGT,
+ MN_SHI,
+ MN_SLE,
+ MN_SLS,
+ MN_SLT,
+ MN_SMT,
+ MN_SNE,
+ MN_SPL,
+ MN_SVC,
+ MN_SVS,
+ MN_STOP,
+ MN_SUB,
+ MN_SUBA,
+ MN_SUBI,
+ MN_SUBQ,
+ MN_SUBX,
+ MN_SWAP,
+ MN_TAS,
+ MN_TRAP,
+ MN_TRAPV,
+ MN_TST,
+ MN_UNLK,
+ MNEMONICS_COUNT,
};
enum opsize {
@@ -138,7 +258,7 @@ enum opsize {
OPSIZE_L,
};
-enum arg_kind {
+enum arg_type {
ARG_NONE = 0,
ARG_DN,
ARG_AN,
@@ -146,29 +266,57 @@ enum arg_kind {
ARG_AN_ADDR_INCR,
ARG_AN_ADDR_DECR,
ARG_AN_ADDR_16,
- ARG_AN_ADDR_8_XN,
+ ARG_AN_ADDR_8_XI,
ARG_ADDR_WORD,
ARG_ADDR_LONG,
ARG_ADDR_UNSPEC,
ARG_PC_ADDR_16,
ARG_PC_ADDR_8_XN,
ARG_IMMEDIATE,
+ ARG_SR,
+ ARG_CCR,
+ ARG_USP,
+ ARG_EXPR,
};
-struct arg_8_xn {
- int8_t val;
+enum args_count {
+ ARGS_COUNT_UNKNOWN = 0,
+ ARGS_COUNT_0,
+ ARGS_COUNT_0_1,
+ ARGS_COUNT_0_1_2,
+ ARGS_COUNT_0_2,
+ ARGS_COUNT_1,
+ ARGS_COUNT_1_2,
+ ARGS_COUNT_2,
+};
+
+struct arg_16 {
+ int16_t d;
+ int8_t an;
+};
+
+struct arg_8 {
+ int8_t d;
int8_t an;
int8_t xi;
};
+union arg_contents {
+ int32_t imm, addr, xn;
+ struct arg_16 arg_16; // For (d16,An) and (d16,PC)
+ struct arg_8 arg_8; // For (d8,An,Xi) and (d8,PC,Xn)
+};
+
+struct arg {
+ enum arg_type arg_type;
+ union arg_contents arg_contents;
+ size_t first_token, num_tokens; // Expression tokens span, may be NULL
+};
+
struct instruction {
enum mnemonic mnemonic;
enum opsize opsize;
- enum arg_kind arg1_kind, arg2_kind;
- union {
- int32_t imm, addr;
- struct arg_8_xn arg_8_xn; // For (d,An,Xi) and (d,PC,Xn)
- } arg1, arg2;
+ struct arg arg1, arg2;
};
struct def_endef {
@@ -180,13 +328,14 @@ struct def_endef {
};
struct stmt {
- enum stmt_kind type;
+ enum stmt_type type;
union {
struct instruction instruction;
int32_t align;
size_t globl_sym_id;
size_t file_sym_id;
};
+ size_t label_token;
size_t first_token, num_tokens; // Statement tokens span, may be NULL
size_t comment_token;
};
@@ -257,6 +406,127 @@ const char *const g_escape_table[256] = {
"\\xfd", "\\xfe",
};
+struct mnemonic_meta {
+ const char *str;
+ enum args_count args_count;
+} g_mnemmonics[MNEMONICS_COUNT] = {
+ { "none", ARGS_COUNT_0 },
+ { "abcd", ARGS_COUNT_2 },
+ { "add", ARGS_COUNT_2 },
+ { "adda", ARGS_COUNT_2 },
+ { "addi", ARGS_COUNT_2 },
+ { "addq", ARGS_COUNT_2 },
+ { "addx", ARGS_COUNT_2 },
+ { "and", ARGS_COUNT_2 },
+ { "andi", ARGS_COUNT_2 },
+ { "asl", ARGS_COUNT_1_2 },
+ { "bra", ARGS_COUNT_1 },
+ { "bsr", ARGS_COUNT_1 },
+ { "bcc", ARGS_COUNT_1 },
+ { "bcs", ARGS_COUNT_1 },
+ { "beq", ARGS_COUNT_1 },
+ { "bge", ARGS_COUNT_1 },
+ { "bgt", ARGS_COUNT_1 },
+ { "bhi", ARGS_COUNT_1 },
+ { "ble", ARGS_COUNT_1 },
+ { "bls", ARGS_COUNT_1 },
+ { "blt", ARGS_COUNT_1 },
+ { "bmt", ARGS_COUNT_1 },
+ { "bne", ARGS_COUNT_1 },
+ { "bpl", ARGS_COUNT_1 },
+ { "bvc", ARGS_COUNT_1 },
+ { "bvs", ARGS_COUNT_1 },
+ { "bchg", ARGS_COUNT_2 },
+ { "bclr", ARGS_COUNT_2 },
+ { "bset", ARGS_COUNT_2 },
+ { "chk", ARGS_COUNT_2 },
+ { "clr", ARGS_COUNT_1 },
+ { "cmp", ARGS_COUNT_2 },
+ { "cmpa", ARGS_COUNT_2 },
+ { "cmpi", ARGS_COUNT_2 },
+ { "cmpm", ARGS_COUNT_2 },
+ { "dbt", ARGS_COUNT_2 },
+ { "dbf", ARGS_COUNT_2 },
+ { "dbcc", ARGS_COUNT_2 },
+ { "dbcs", ARGS_COUNT_2 },
+ { "dbeq", ARGS_COUNT_2 },
+ { "dbge", ARGS_COUNT_2 },
+ { "dbgt", ARGS_COUNT_2 },
+ { "dbhi", ARGS_COUNT_2 },
+ { "dble", ARGS_COUNT_2 },
+ { "dbls", ARGS_COUNT_2 },
+ { "dblt", ARGS_COUNT_2 },
+ { "dbmt", ARGS_COUNT_2 },
+ { "dbne", ARGS_COUNT_2 },
+ { "dbpl", ARGS_COUNT_2 },
+ { "dbvc", ARGS_COUNT_2 },
+ { "dbvs", ARGS_COUNT_2 },
+ { "divu", ARGS_COUNT_2 },
+ { "divs", ARGS_COUNT_2 },
+ { "eori", ARGS_COUNT_2 },
+ { "exg", ARGS_COUNT_2 },
+ { "ext", ARGS_COUNT_1 },
+ { "illegal", ARGS_COUNT_0 },
+ { "jmp", ARGS_COUNT_1 },
+ { "jsr", ARGS_COUNT_1 },
+ { "lea", ARGS_COUNT_2 },
+ { "link", ARGS_COUNT_2 },
+ { "lsl", ARGS_COUNT_1_2 },
+ { "lsr", ARGS_COUNT_1_2 },
+ { "move", ARGS_COUNT_2 },
+ { "movea", ARGS_COUNT_2 },
+ { "movem", ARGS_COUNT_2 },
+ { "movep", ARGS_COUNT_2 },
+ { "moveq", ARGS_COUNT_2 },
+ { "muls", ARGS_COUNT_2 },
+ { "mulu", ARGS_COUNT_2 },
+ { "nbcd", ARGS_COUNT_1 },
+ { "neg", ARGS_COUNT_1 },
+ { "negx", ARGS_COUNT_1 },
+ { "nop", ARGS_COUNT_0 },
+ { "not", ARGS_COUNT_1 },
+ { "or", ARGS_COUNT_2 },
+ { "ori", ARGS_COUNT_2 },
+ { "pea", ARGS_COUNT_1 },
+ { "reset", ARGS_COUNT_0 },
+ { "rol", ARGS_COUNT_1_2 },
+ { "ror", ARGS_COUNT_1_2 },
+ { "roxl", ARGS_COUNT_1_2 },
+ { "roxr", ARGS_COUNT_1_2 },
+ { "rte", ARGS_COUNT_0 },
+ { "rtr", ARGS_COUNT_0 },
+ { "rts", ARGS_COUNT_0 },
+ { "sbcd", ARGS_COUNT_2 },
+ { "st", ARGS_COUNT_1 },
+ { "sf", ARGS_COUNT_1 },
+ { "scc", ARGS_COUNT_1 },
+ { "scs", ARGS_COUNT_1 },
+ { "seq", ARGS_COUNT_1 },
+ { "sge", ARGS_COUNT_1 },
+ { "sgt", ARGS_COUNT_1 },
+ { "shi", ARGS_COUNT_1 },
+ { "sle", ARGS_COUNT_1 },
+ { "sls", ARGS_COUNT_1 },
+ { "slt", ARGS_COUNT_1 },
+ { "smt", ARGS_COUNT_1 },
+ { "sne", ARGS_COUNT_1 },
+ { "spl", ARGS_COUNT_1 },
+ { "svc", ARGS_COUNT_1 },
+ { "svs", ARGS_COUNT_1 },
+ { "stop", ARGS_COUNT_1 },
+ { "sub", ARGS_COUNT_2 },
+ { "suba", ARGS_COUNT_2 },
+ { "subi", ARGS_COUNT_2 },
+ { "subq", ARGS_COUNT_2 },
+ { "subx", ARGS_COUNT_2 },
+ { "swap", ARGS_COUNT_1 },
+ { "tas", ARGS_COUNT_1 },
+ { "trap", ARGS_COUNT_1 },
+ { "trapv", ARGS_COUNT_0 },
+ { "tst", ARGS_COUNT_1 },
+ { "unlk", ARGS_COUNT_1 },
+};
+
static bool should_be_escaped(const int c)
{
return c < ' ' || c == '"' || c == '\\' || c == '<' || c == '>' || c > '~';
@@ -312,7 +582,7 @@ static int fprint_string_escaped(
return written;
}
-static const char *tok_kind_to_string(const enum token_type type)
+static const char *token_type_to_string(const enum token_type type)
{
switch (type) {
case TT_NONE: return "NONE";
@@ -332,7 +602,6 @@ static const char *tok_kind_to_string(const enum token_type type)
case TT_RIGHT_SHIFT: return "RIGHT_SHIFT";
case TT_HASH: return "HASH";
case TT_BANG: return "BANG";
- case TT_DOLLAR: return "DOLLAR";
case TT_TILDE: return "TILDE";
case TT_AMPERSAND: return "AMPERSAND";
case TT_PIPE: return "PIPE";
@@ -351,13 +620,13 @@ static const char *tok_kind_to_string(const enum token_type type)
case TT_COMMENT_ASTERISK: return "COMMENT";
case TT_COMMENT_SEMICOLON: return "COMMENT";
}
- assert(0);
- return "UNKNOWN";
+ UNREACHABLE();
+ return "_UNKNOWN";
}
static int fprint_tok(const char *const input, struct token *token, FILE *const stream)
{
- int res = fprintf(stream, "%s<", tok_kind_to_string(token->type));
+ int res = fprintf(stream, "%s<", token_type_to_string(token->type));
if (res == -1) {
return -1;
}
@@ -375,6 +644,13 @@ static int fprint_tok(const char *const input, struct token *token, FILE *const
return written;
}
+static int fwrite_token(const struct token *const token, FILE *const stream)
+{
+ const int res = fwrite(token, sizeof *token, 1, stream);
+ assert(res == 1);
+ return res;
+}
+
static int lex_init(struct lex *const self)
{
*self = (struct lex){
@@ -384,16 +660,13 @@ static int lex_init(struct lex *const self)
};
assert(self->input_stream != NULL);
assert(self->tokbuf_stream != NULL);
+ // Place a dummy token at index 0, so the first real token will be at index 1.
+ // This is needed by the parser, so it can use zero to indicate the absence
+ // of a token.
+ fwrite_token(&(struct token){TT_NONE}, self->tokbuf_stream);
return OK;
}
-static int fwrite_token(const struct token *const token, FILE *const stream)
-{
- const int res = fwrite(token, sizeof *token, 1, stream);
- assert(res == 1);
- return res;
-}
-
static void lex_yield_token(struct lex *const self, const struct token *const token)
{
self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE);
@@ -443,18 +716,19 @@ static const char *lex_state_error_string(
case LS_COMMENT_SEMICOLON:
case LS_ERROR:
case LS_EOF:
- assert(0);
+ UNREACHABLE();
break;
}
return "???";
}
-static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
+static struct line_pos_info lex_get_line_pos_info(
+ const struct lex *const self, const size_t cursor)
{
struct line_pos_info l = {0, 0, 0};
bool cr = false;
// `input` is null terminated, that's why we subtract 1 here
- for (size_t i = 0; i < self->input_size - 1; i++) {
+ for (size_t i = 0; i < cursor; i++) {
const char c = self->input[i];
if (c == '\r') {
cr = true;
@@ -479,8 +753,8 @@ static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)
static int lex_yield_error(struct lex *const self, const int c)
{
fflush(self->input_stream);
- const struct line_pos_info l = lex_get_line_pos_info(self);
const size_t cursor = self->cursor;
+ const struct line_pos_info l = lex_get_line_pos_info(self, cursor);
{
// Read out the rest of the line
int c;
@@ -531,6 +805,9 @@ static int lex_handle_next(struct lex *const self, const int c)
} else if (c == '@') {
self->tok_offset = self->cursor;
self->state = LS_NUMOCT;
+ } else if (c == '$') {
+ self->tok_offset = self->cursor;
+ self->state = LS_NUMHEX;
} else if (c == '"') {
self->tok_offset = self->cursor;
self->state = LS_STRING;
@@ -584,8 +861,6 @@ static int lex_handle_next(struct lex *const self, const int c)
lex_yield_token(self, &(struct token){TT_HASH, self->cursor, 1});
} else if (c == '!') {
lex_yield_token(self, &(struct token){TT_BANG, self->cursor, 1});
- } else if (c == '$') {
- lex_yield_token(self, &(struct token){TT_DOLLAR, self->cursor, 1});
} else if (c == '~') {
lex_yield_token(self, &(struct token){TT_TILDE, self->cursor, 1});
} else if (c == '&') {
@@ -757,7 +1032,7 @@ static int lex_handle_next(struct lex *const self, const int c)
case LS_ERROR:
return ERR;
case LS_EOF:
- assert(0);
+ UNREACHABLE();
}
return CONTINUE;
}
@@ -815,99 +1090,619 @@ static void lex_destroy(struct lex *const self)
free(self->tokbuf);
}
+static enum args_count get_args_count_for_mnemonic(const enum mnemonic m)
+{
+ assert(m < MNEMONICS_COUNT);
+ return g_mnemmonics[m].args_count;
+}
+
+static const char *mnemonic_to_string(const enum mnemonic m)
+{
+ assert(m < MNEMONICS_COUNT);
+ return g_mnemmonics[m].str;
+}
+
+static const char *opsize_to_string(const enum opsize s)
+{
+ switch (s) {
+ case OPSIZE_NONE: return "none";
+ case OPSIZE_S: return "short";
+ case OPSIZE_B: return "byte";
+ case OPSIZE_W: return "word";
+ case OPSIZE_L: return "long";
+ }
+ UNREACHABLE();
+ return "_unknown";
+}
+
+static enum mnemonic get_mnemonic_from_identifier(
+ const char *const str, const size_t str_length)
+{
+ if (str_length > 7) {
+ return MN_NONE;
+ }
+ char mnemonic_str[8] = {0};
+ for (size_t i = 0; i < str_length; i++) {
+ mnemonic_str[i] = tolower(str[i]);
+ }
+ // Start from 1 since index 0 is the dummy NONE entry
+ for (size_t i = 1; i < MNEMONICS_COUNT; i++) {
+ if (0 == strcmp(mnemonic_str, g_mnemmonics[i].str)) {
+ return (enum mnemonic)i;
+ }
+ }
+ return MN_NONE;
+}
+
+static const char *arg_type_to_string(const enum arg_type type)
+{
+ switch (type) {
+ case ARG_NONE: return "NONE";
+ case ARG_DN: return "Dn";
+ case ARG_AN: return "An";
+ case ARG_AN_ADDR: return "(An)";
+ case ARG_AN_ADDR_INCR: return "(An)+";
+ case ARG_AN_ADDR_DECR: return "-(An)";
+ case ARG_AN_ADDR_16: return "(d16,An)";
+ case ARG_AN_ADDR_8_XI: return "(d8,An,Xi)";
+ case ARG_ADDR_WORD: return "(xxx).w";
+ case ARG_ADDR_LONG: return "(xxx).l";
+ case ARG_ADDR_UNSPEC: return "(xxx).?";
+ case ARG_PC_ADDR_16: return "(d16,PC)";
+ case ARG_PC_ADDR_8_XN: return "(d8,PC,Xn)";
+ case ARG_IMMEDIATE: return "#imm";
+ case ARG_SR: return "SR";
+ case ARG_CCR: return "CCR";
+ case ARG_USP: return "USP";
+ case ARG_EXPR: return "EXPR";
+ }
+ UNREACHABLE();
+ return "_UNKNOWN";
+}
+
static int pars_init(struct pars *const self, const struct lex *const lex)
{
*self = (struct pars){
.lex = lex,
.stmttab_stream = open_memstream(
(char **)&self->stmttab, &self->stmttab_size),
+ .symtab_stream = open_memstream(
+ (char **)&self->symtab, &self->symtab_size),
.symbuf_stream = open_memstream(&self->symbuf, &self->symbuf_size),
};
assert(self->stmttab_stream != NULL);
+ assert(self->symtab_stream != NULL);
assert(self->symbuf_stream != NULL);
return OK;
}
-static int pars_yield_error(struct pars *const self)
+static bool pars_is_eof_reached(const struct pars *const self)
+{
+ const size_t tokens_count = self->lex->tokbuf_size /
+ (sizeof *self->lex->tokbuf);
+ return self->cur_tok_id >= tokens_count;
+}
+
+static const char *stmt_type_to_string(const enum stmt_type type)
+{
+ switch (type) {
+ case ST_NONE: return "NONE";
+ case ST_LABEL: return "LABEL";
+ case ST_INSTRUCTION: return "INSTRUCTION";
+ case ST_ASSIGNMENT: return "ASSIGNMENT";
+ case ST_COMMENT: return "COMMENT";
+ case ST_DIRECTIVE: return "DIRECTIVE";
+ }
+ return "_UNKNOWN";
+}
+
+static void fprint_arg(
+ const struct lex *const lex,
+ const struct arg *const arg,
+ FILE *const s)
+{
+ fprintf(s, "(%s", arg_type_to_string(arg->arg_type));
+ switch (arg->arg_type) {
+ case ARG_NONE:
+ case ARG_DN:
+ case ARG_AN:
+ case ARG_AN_ADDR:
+ case ARG_AN_ADDR_INCR:
+ case ARG_AN_ADDR_DECR:
+ fprintf(s, " reg %d", arg->arg_contents.xn);
+ break;
+ case ARG_AN_ADDR_16:
+ fprintf(s, " reg %d", arg->arg_contents.arg_16.an);
+ fprintf(s, " d16 %d", arg->arg_contents.arg_16.d);
+ break;
+ case ARG_AN_ADDR_8_XI:
+ fprintf(s, " reg %d", arg->arg_contents.arg_8.an);
+ fprintf(s, " d8 %d", arg->arg_contents.arg_8.d);
+ fprintf(s, " xi %d", arg->arg_contents.arg_8.xi);
+ break;
+ case ARG_ADDR_WORD:
+ case ARG_ADDR_LONG:
+ case ARG_ADDR_UNSPEC:
+ fprintf(s, " addr %d", arg->arg_contents.addr);
+ break;
+ case ARG_PC_ADDR_16:
+ fprintf(s, " d16 %d", arg->arg_contents.arg_16.d);
+ break;
+ case ARG_PC_ADDR_8_XN:
+ fprintf(s, " d8 %d", arg->arg_contents.arg_8.d);
+ fprintf(s, " xn %d", arg->arg_contents.arg_8.xi);
+ break;
+ case ARG_IMMEDIATE:
+ fprintf(s, " value %d", arg->arg_contents.imm);
+ break;
+ case ARG_SR:
+ case ARG_CCR:
+ case ARG_USP:
+ case ARG_EXPR:
+ break;
+ }
+ fprintf(s, " raw \"");
+ for (size_t i = 0; i < arg->num_tokens; i++) {
+ const struct token token = lex->tokbuf[arg->first_token + i];
+ if (token.type == TT_NEWLINE) {
+ break;
+ }
+ fprintf(s, "%.*s ", (int)token.length, lex->input + token.offset);
+ }
+ fprintf(s, "\")");
+}
+
+static int fprint_stmt(
+ const struct lex *const lex,
+ struct stmt *const stmt,
+ FILE *const s)
+{
+ assert(stmt);
+ fprintf(s, "(%s", stmt_type_to_string(stmt->type));
+ if (stmt->label_token) {
+ const struct token label = lex->tokbuf[stmt->label_token];
+ fprintf(s, "\n\t(label \"%.*s\")", (int)label.length, lex->input + label.offset);
+ }
+ if (stmt->type == ST_INSTRUCTION) {
+ fprintf(s, "\n\t(mnemonic \"%s\")", mnemonic_to_string(stmt->instruction.mnemonic));
+ fprintf(s, "\n\t(size %s)", opsize_to_string(stmt->instruction.opsize));
+ if (stmt->instruction.arg1.arg_type != ARG_NONE) {
+ fprintf(s, "\n\t(arg1 ");
+ fprint_arg(lex, &stmt->instruction.arg1, s);
+ fprintf(s, ")");
+ }
+ if (stmt->instruction.arg2.arg_type != ARG_NONE) {
+ assert(stmt->instruction.arg1.arg_type != ARG_NONE);
+ fprintf(s, "\n\t(arg2 ");
+ fprint_arg(lex, &stmt->instruction.arg2, s);
+ fprintf(s, ")");
+ }
+ }
+ if (stmt->comment_token) {
+ const struct token comment = lex->tokbuf[stmt->comment_token];
+ fprintf(s, "\n\t(comment \"%.*s\")", (int)comment.length, lex->input + comment.offset);
+ }
+ fprintf(s, "\n\t(raw \"");
+ for (size_t i = 0; i < stmt->num_tokens; i++) {
+ const struct token token = lex->tokbuf[stmt->first_token + i];
+ if (token.type == TT_NEWLINE) {
+ break;
+ }
+ fprintf(s, "%.*s ", (int)token.length, lex->input + token.offset);
+ }
+ fprintf(s, "\"))\n");
+ return 0;
+}
+
+static int fwrite_stmt(const struct stmt *const stmt, FILE *const stream)
+{
+ const int res = fwrite(stmt, sizeof *stmt, 1, stream);
+ assert(res == 1);
+ return res;
+}
+
+static size_t find_line_length(const char *const str)
+{
+ for (size_t i = 0;; i++) {
+ const char c = str[i];
+ if (c == '\n' || c == '\r' || c == '\000') {
+ return i;
+ }
+ }
+ return 0;
+}
+
+static int pars_yield_error_str(
+ struct pars *const self, const struct line_pos_info l, char *const str)
+{
+ fprintf(
+ stderr,
+ "<stdin>:%lu:%lu: parsing error: expected %s, found '%s'\n",
+ l.line_num + 1,
+ l.column_num + 1,
+ "<TODO>",
+ str);
+ free(str);
+ const size_t line_length = find_line_length(self->lex->input + l.line_offset);
+ char *const line = calloc(1, line_length + 1);
+ memcpy(line, self->lex->input + l.line_offset, line_length);
+ fprintf(stderr, "%5lu | %s\n", l.line_num + 1, line);
+ free(line);
+ fputs(" | ", stderr);
+ for (size_t i = 0; i < l.column_num; i++) {
+ if (self->lex->input[l.line_offset + i] == '\t') {
+ fputc('\t', stderr);
+ } else {
+ fputc(' ', stderr);
+ }
+ }
+ fputs("^\n", stderr);
+ return ERR;
+}
+
+static int pars_yield_error(struct pars *const self, const size_t token_id)
+{
+ const struct token token = self->lex->tokbuf[token_id];
+ const struct line_pos_info l =
+ lex_get_line_pos_info(self->lex, token.offset);
+ char *const found = calloc(1, token.length + 10);
+ snprintf(found, token.length + 1, "%s", self->lex->input + token.offset);
+ return pars_yield_error_str(self, l, found);
+}
+
+static int pars_yield_error_nesting(
+ struct pars *const self,
+ const size_t expression_start_token_id,
+ const size_t expression_length_tokens)
{
(void) self;
- // TODO
+ (void) expression_start_token_id;
+ (void) expression_length_tokens;
return ERR;
}
-static int pars_parse_label(struct pars *const self)
+static int pars_yield_error_eof(struct pars *const self)
{
- const struct token label = self->lex->tokbuf[self->cur_tok_id++];
- if (label.type != TT_IDENTIFIER) {
- return pars_yield_error(self);
- }
- // TODO
- return OK;
+ (void) self;
+ return ERR;
}
-static int pars_parse_instr(struct pars *const self, const struct token mnemo)
+static int pars_parse_direc(
+ struct pars *const self, const struct token *const dot)
{
(void) self;
- (void) mnemo;
- // TODO
+ (void) dot;
+ return pars_yield_error(self, self->cur_tok_id);
+}
+
+enum opsize get_opsize_from_specifier(const char size_specifier)
+{
+ switch (tolower(size_specifier)) {
+ case 's': return OPSIZE_S;
+ case 'b': return OPSIZE_B;
+ case 'w': return OPSIZE_W;
+ case 'l': return OPSIZE_L;
+ }
+ return OPSIZE_NONE;
+}
+
+static bool is_expression_token(const enum token_type type)
+{
+ switch (type) {
+ case TT_PLUS: return true;
+ case TT_MINUS: return true;
+ case TT_ASTERISK: return true;
+ case TT_SLASH: return true;
+ case TT_PERCENT: return true;
+ case TT_LEFT_SHIFT: return true;
+ case TT_RIGHT_SHIFT: return true;
+ case TT_HASH: return true;
+ case TT_BANG: return true;
+ case TT_TILDE: return true;
+ case TT_AMPERSAND: return true;
+ case TT_PIPE: return true;
+ case TT_CAP: return true;
+ case TT_IDENTIFIER: return true;
+ case TT_NUMDEC: return true;
+ case TT_NUMOCT: return true;
+ case TT_NUMHEX: return true;
+ case TT_LPAREN: return true;
+ case TT_RPAREN: return true;
+ default: return false;
+ }
+ return false;
+}
+
+static int pars_parse_arg(
+ struct pars *const self, struct arg *const arg)
+{
+ const size_t tokens_count = self->lex->tokbuf_size /
+ (sizeof *self->lex->tokbuf);
+ const size_t first_token_id = self->cur_tok_id;
+ int nesting = 0;
+ enum arg_type arg_type = ARG_EXPR;
+ while (self->cur_tok_id < tokens_count) {
+ const size_t token_id = self->cur_tok_id; // Peek
+ const struct token token = self->lex->tokbuf[token_id];
+ if (token.type == TT_LPAREN) {
+ nesting++;
+ } else if (token.type == TT_RPAREN) {
+ nesting--;
+ } else if (is_expression_token(token.type)) {
+ // TODO parse expression
+ } else if (nesting > 0 && token.type == TT_COMMA) {
+ // Comma inside parentheses is allowed
+ } else {
+ break;
+ }
+ self->cur_tok_id++; // Commit
+ }
+ if (nesting != 0) {
+ return pars_yield_error_nesting(
+ self, first_token_id, self->cur_tok_id - first_token_id);
+ }
+ if (first_token_id == self->cur_tok_id) {
+ // Nothing has been parsed
+ *arg = (struct arg){0};
+ } else {
+ *arg = (struct arg){
+ .arg_type = arg_type,
+ // TODO arg_contents
+ .first_token = first_token_id,
+ .num_tokens = self->cur_tok_id - first_token_id,
+ };
+ }
return OK;
}
-static int pars_parse_direc(struct pars *const self)
+static int pars_yield_instruction(
+ struct pars *const self,
+ const size_t label_id,
+ const size_t comment_id,
+ const size_t mnemonic_id,
+ const enum opsize opsize,
+ const struct arg *const arg1,
+ const struct arg *const arg2)
{
- const char *input = self->lex->input;
- const struct token direc = self->lex->tokbuf[self->cur_tok_id++];
- if (direc.type != TT_IDENTIFIER) {
- return pars_yield_error(self);
+ const struct token mnemonic_token = self->lex->tokbuf[mnemonic_id];
+ const enum mnemonic mnemonic = get_mnemonic_from_identifier(
+ self->lex->input + mnemonic_token.offset, mnemonic_token.length);
+ if (mnemonic == MN_NONE) {
+ return pars_yield_error(self, mnemonic_id);
+ }
+ if (arg2) {
+ assert(arg1);
}
- if (0 == strcmp(input + direc.offset, "def")) {
- } else if (0 == strcmp(input + direc.offset, "opt")) {
- } else if (0 == strcmp(input + direc.offset, "file")) {
- } else if (0 == strcmp(input + direc.offset, "text")) {
- } else if (0 == strcmp(input + direc.offset, "align")) {
- } else if (0 == strcmp(input + direc.offset, "globl")) {
- } else if (0 == strcmp(input + direc.offset, "ln")) {
- } else if (0 == strcmp(input + direc.offset, "long")) {
- } else if (0 == strcmp(input + direc.offset, "word")) {
- } else if (0 == strcmp(input + direc.offset, "byte")) {
- } else if (0 == strcmp(input + direc.offset, "bin")) {
+ const enum args_count args_count = get_args_count_for_mnemonic(mnemonic);
+ // Validate instruction arguments count
+ switch (args_count) {
+ case ARGS_COUNT_UNKNOWN:
+ UNREACHABLE();
+ break;
+ case ARGS_COUNT_0:
+ if (arg1) {
+ return pars_yield_error(self, arg1->first_token);
+ }
+ break;
+ case ARGS_COUNT_0_1:
+ if (arg2) {
+ return pars_yield_error(self, arg2->first_token);
+ }
+ break;
+ case ARGS_COUNT_0_1_2:
+ break;
+ case ARGS_COUNT_0_2:
+ if (arg1 && !arg2) {
+ return pars_yield_error(self, mnemonic_id);
+ }
+ break;
+ case ARGS_COUNT_1:
+ if (!arg1) {
+ return pars_yield_error(self, mnemonic_id);
+ } else if (arg2) {
+ return pars_yield_error(self, arg2->first_token);
+ }
+ break;
+ case ARGS_COUNT_1_2:
+ if (!arg1) {
+ return pars_yield_error(self, mnemonic_id);
+ }
+ break;
+ case ARGS_COUNT_2:
+ if (!arg1 || !arg2) {
+ return pars_yield_error(self, mnemonic_id);
+ }
+ break;
}
- // TODO
+ const size_t first_token_id = label_id ? label_id : mnemonic_id;
+ const struct stmt stmt = {
+ .type = ST_INSTRUCTION,
+ .instruction = {
+ .mnemonic = mnemonic,
+ .opsize = opsize,
+ .arg1 = arg1 ? *arg1 : (struct arg){0},
+ .arg2 = arg2 ? *arg2 : (struct arg){0},
+ },
+ .label_token = label_id,
+ .comment_token = comment_id,
+ .first_token = first_token_id,
+ .num_tokens = self->cur_tok_id - first_token_id,
+ };
+ fwrite_stmt(&stmt, self->stmttab_stream);
return OK;
}
-static int pars_parse_instr_or_direc(struct pars *const self)
+static int pars_parse_instruction_comment(
+ struct pars *const self,
+ const size_t label_id,
+ const size_t mnemonic_id,
+ const enum opsize opsize,
+ const struct arg *const arg1,
+ const struct arg *const arg2)
{
- struct token token;
- if (token.type == TT_DOT) {
- return pars_parse_direc(self);
- } else if (token.type == TT_IDENTIFIER) {
- return pars_parse_instr(self, token);
- } else {
- return pars_yield_error(self);
+ size_t comment_id = 0;
+ if (!pars_is_eof_reached(self)) {
+ // Try parse comment
+ const size_t token1_id = self->cur_tok_id; // Peek comment
+ const struct token token1 = self->lex->tokbuf[token1_id];
+ const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
+ token1.type == TT_COMMENT_SEMICOLON;
+ if (is_comment) {
+ self->cur_tok_id++; // Commit comment
+ comment_id = token1_id;
+ }
+ if (!pars_is_eof_reached(self)) {
+ // Handle new line
+ const size_t nl_id = self->cur_tok_id++; // Commit new line
+ const struct token nl = self->lex->tokbuf[nl_id];
+ if (nl.type != TT_NEWLINE) {
+ return pars_yield_error(self, nl_id);
+ }
+ }
+ }
+ return pars_yield_instruction(
+ self, label_id, comment_id, mnemonic_id, opsize, arg1, arg2);
+}
+
+static int pars_parse_instruction_args(
+ struct pars *const self,
+ const size_t label_id,
+ const size_t mnemonic_id,
+ const enum opsize opsize)
+{
+ struct arg arg1, arg2;
+ // Try parse first argument
+ const int res1 = pars_parse_arg(self, &arg1);
+ if (res1 != OK) {
+ return res1;
+ }
+ if (arg1.arg_type == ARG_NONE) {
+ return pars_parse_instruction_comment(
+ self, label_id, mnemonic_id, opsize, NULL, NULL);
+ }
+ if (pars_is_eof_reached(self)) {
+ return pars_yield_instruction(
+ self, label_id, 0, mnemonic_id, opsize, &arg1, NULL);
+ }
+ const size_t comma_id = self->cur_tok_id; // Peek comma
+ const struct token comma = self->lex->tokbuf[comma_id];
+ if (comma.type != TT_COMMA) {
+ return pars_parse_instruction_comment(
+ self, label_id, mnemonic_id, opsize, NULL, NULL);
+ }
+ self->cur_tok_id++; // Commit comma
+ // Try parse second argument
+ const int res2 = pars_parse_arg(self, &arg2);
+ if (res2 != OK) {
+ return res2;
+ }
+ if (pars_is_eof_reached(self)) {
+ return pars_yield_instruction(
+ self, label_id, 0, mnemonic_id, opsize, &arg1, &arg2);
+ }
+ // Finish parsing instruction, expect comment or newline
+ return pars_parse_instruction_comment(
+ self, label_id, mnemonic_id, opsize, &arg1, &arg2);
+}
+
+static int pars_parse_instruction(
+ struct pars *const self,
+ const size_t label_id,
+ const size_t mnemonic_id)
+{
+ if (pars_is_eof_reached(self)) {
+ return pars_yield_error_eof(self);
+ }
+ const size_t token2_id = self->cur_tok_id; // Peek
+ const struct token token2 = self->lex->tokbuf[token2_id];
+ if (token2.type == TT_DOT) {
+ self->cur_tok_id++; // Commit
+ if (pars_is_eof_reached(self)) {
+ return pars_yield_error_eof(self);
+ }
+ const size_t size_spec_id = self->cur_tok_id++;
+ const struct token size_spec = self->lex->tokbuf[size_spec_id];
+ if (size_spec.type != TT_IDENTIFIER) {
+ return pars_yield_error(self, size_spec_id);
+ }
+ // Size specifier
+ if (size_spec.length != 1) {
+ return pars_yield_error(self, size_spec_id);
+ }
+ const size_t opsize =
+ get_opsize_from_specifier(self->lex->input[size_spec.offset]);
+ if (opsize == OPSIZE_NONE) {
+ return pars_yield_error(self, size_spec_id);
+ }
+ if (pars_is_eof_reached(self)) {
+ return pars_yield_error_eof(self);
+ }
+ return pars_parse_instruction_args(self, label_id, mnemonic_id, opsize);
+ }
+ return pars_parse_instruction_args(
+ self, label_id, mnemonic_id, OPSIZE_NONE);
+}
+
+static int pars_parse_assignment(
+ struct pars *const self, const size_t label_id, const size_t symbol_id)
+{
+ (void) label_id;
+ (void) symbol_id;
+ return pars_yield_error(self, self->cur_tok_id);
+}
+
+static int pars_yield_label_comment(
+ struct pars *const self, const size_t label_id, const size_t comment_id)
+{
+ if (label_id || comment_id) {
+ const size_t first_token = label_id ? label_id : comment_id;
+ struct stmt stmt = {
+ .type = label_id ? ST_LABEL : ST_COMMENT,
+ .label_token = label_id,
+ .comment_token = comment_id,
+ .first_token = first_token,
+ .num_tokens = self->cur_tok_id - first_token,
+ };
+ fwrite_stmt(&stmt, self->stmttab_stream);
}
return OK;
}
-static int pars_parse_statement(struct pars *const self)
+static int pars_parse_labeled_statement(
+ struct pars *const self, const size_t label_id)
{
- const size_t tokens_count = self->lex->tokbuf_size /
- (sizeof *self->lex->tokbuf);
- const struct token token = self->lex->tokbuf[self->cur_tok_id++];
- const bool is_comment = token.type == TT_COMMENT_ASTERISK ||
- token.type == TT_COMMENT_SEMICOLON;
- if (token.type == TT_IDENTIFIER) {
- return pars_parse_label(self);
- } else if (!is_comment) {
- if (self->cur_tok_id < tokens_count) {
- const struct token nl = self->lex->tokbuf[self->cur_tok_id++];
- assert(nl.type == TT_NEWLINE);
+ const size_t token1_id = self->cur_tok_id++;
+ const struct token token1 = self->lex->tokbuf[token1_id];
+ const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
+ token1.type == TT_COMMENT_SEMICOLON;
+ if (is_comment) {
+ return pars_yield_label_comment(self, label_id, token1_id);
+ } else if (token1.type == TT_NEWLINE) {
+ return pars_yield_label_comment(self, label_id, 0);
+ } else if (token1.type == TT_IDENTIFIER) {
+ if (pars_is_eof_reached(self)) {
+ return pars_yield_error_eof(self);
}
- return OK;
- } else if (token.type == TT_NEWLINE) {
- return OK;
+ const size_t token2_id = self->cur_tok_id; // Peek
+ const struct token token2 = self->lex->tokbuf[token2_id];
+ if (!label_id && token2.type == TT_COLON) {
+ self->cur_tok_id++; // Commit
+ return pars_parse_labeled_statement(self, token1_id);
+ } else if (token2.type == TT_EQ || token2.type == TT_EQ_DOUBLE) {
+ self->cur_tok_id++; // Commit
+ return pars_parse_assignment(self, label_id, token2_id);
+ } else {
+ return pars_parse_instruction(self, label_id, token1_id);
+ }
+ } else if (token1.type == TT_DOT) {
+ return pars_parse_direc(self, &token1);
}
- return pars_yield_error(self);
+ return pars_yield_error(self, token1_id);
+}
+
+static int pars_parse_statement(struct pars *const self)
+{
+ return pars_parse_labeled_statement(self, 0);
}
/** Run parser until the end of the input reached
@@ -916,22 +1711,31 @@ static int pars_parse_statement(struct pars *const self)
*/
static int pars_run(struct pars *const self)
{
- return OK;
const size_t tokens_count = self->lex->tokbuf_size /
(sizeof *self->lex->tokbuf);
- do {
- const int ret = pars_parse_statement(self);
+ // Skip dummy token at position 0
+ self->cur_tok_id = 1;
+ // Leave dummy statement at position 0
+ fwrite_stmt(&(struct stmt){0}, self->stmttab_stream);
+ int ret = OK;
+ while (self->cur_tok_id < tokens_count) {
+ ret = pars_parse_statement(self);
if (ret != OK) {
- return ret;
+ break;
}
- } while (self->cur_tok_id < tokens_count);
- return OK;
+ }
+ fflush(self->stmttab_stream);
+ fflush(self->symtab_stream);
+ fflush(self->symbuf_stream);
+ return ret;
}
static void pars_destroy(struct pars *const self)
{
fclose(self->stmttab_stream);
free(self->stmttab);
+ fclose(self->symtab_stream);
+ free(self->symtab);
fclose(self->symbuf_stream);
free(self->symbuf);
}
@@ -952,12 +1756,18 @@ static int assem_resolve(struct assem *const self)
static int assem_emit(struct assem *const self, FILE *const stream)
{
- if (TRACE_LEX) {
- const struct lex *const lex = self->pars->lex;
- for (size_t i = 0; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {
+ const struct lex *const lex = self->pars->lex;
+ const struct pars *const pars = self->pars;
+ if (TRACE_LEXER) {
+ for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {
fprint_tok(lex->input, &lex->tokbuf[i], stream);
}
}
+ if (TRACE_PARSER) {
+ for (size_t i = 1; i < pars->stmttab_size / (sizeof *pars->stmttab); i++) {
+ fprint_stmt(lex, pars->stmttab + i, stream);
+ }
+ }
return OK;
}