Impl .align and .file directives parsing

author: Oxore <oxore@protonmail.com> 2023-06-28 00:39:26 +0300
committer: Oxore <oxore@protonmail.com> 2023-06-28 00:52:44 +0300
commit: f77ac6908cca9fb2be32d04fd6387d417cbc9c39 (patch)
tree: afbed08bd677bb186351f1b82a4d9d9f5be4fc0d
parent: 91bc07e04e9009d5eddaacf7002551046699d313 (diff)
1 files changed, 210 insertions, 55 deletions
diff --git a/main.c b/main.c
index df5849c..15cdd87 100644
--- a/main.c
+++ b/main.c
@@ -28,7 +28,7 @@
 #define UNREACHABLE()
 #endif
 
-#define E_UNIMPL "unimplemented"
+#define E_NIMPL "not implemented"
 #define E_UNREACH "unreachable code reached"
 #define E_EXPR "'(', ')', unary operator, binary operator, number or symbol"
 #define E_EXPR_NONREG "symbol that is not a register when parsing expression"
@@ -52,10 +52,13 @@
 #define E_ADDR_SIZE_SPEC "'.b', '.w' or '.l'"
 #define E_ARGS_COUNT "invalid arguments count"
 #define E_NL "new line, which is '\\n', '\\r\\n' or '\\r'"
-#define E_INSTR_END "',', comment or " E_NL
+#define E_COMMENT_NL "';' or " E_NL
+#define E_INSTR_END "',', " E_COMMENT_NL
 #define E_LABELED_STMT "':', '=', '==' or " E_MNEMONIC
 #define E_DIRECTIVE "directive"
 #define E_STMT_BEGIN "label, " E_MNEMONIC ", " E_DIRECTIVE " or " E_NL
+#define E_UNKNOWN_DRC "unknown directive"
+#define E_STR "string"
 
 #define ERR 0
 #define OK 1
@@ -283,6 +286,13 @@ enum mnemonic {
     MNEMONICS_COUNT,
 };
 
+enum directive_type {
+    DT_NONE = 0, // No/unknown directive; index of the dummy g_directives entry
+    DT_ALIGN, // ".align"
+    DT_FILE, // ".file"
+    DIRECTIVES_COUNT, // Number of entries above; sizes g_directives
+};
+
 enum opsize {
     OPSIZE_NONE = 0,
     OPSIZE_S,
@@ -340,21 +350,16 @@ struct instruction {
     struct arg arg1, arg2;
 };
 
-struct def_endef {
-    size_t sym_id;
-    size_t tag_sym_id;
-    int32_t size;
-    int32_t storage_class;
-    int32_t type;
+struct directive {
+    enum directive_type type;
+    size_t first_token, num_tokens; /// Directive arguments tokens span
 };
 
 struct stmt {
     enum stmt_type type;
     union {
         struct instruction instruction;
-        int32_t align;
-        size_t globl_sym_id;
-        size_t file_sym_id;
+        struct directive directive;
     };
     size_t label_token;
     size_t first_token, num_tokens; // Statement tokens span, may be NULL
@@ -458,7 +463,7 @@ const char *const g_escape_table[256] = {
     "\\xfd", "\\xfe",
 };
 
-struct mnemonic_meta {
+const struct mnemonic_meta {
     const char *str;
     enum args_count args_count;
 } g_mnemmonics[MNEMONICS_COUNT] = {
@@ -579,6 +584,18 @@ struct mnemonic_meta {
     { "unlk",   ARGS_COUNT_1 },
 };
 
+static int pars_directive_handler_align(struct pars *self, size_t label_id);
+static int pars_directive_handler_file(struct pars *self, size_t label_id);
+
+const struct directive_description { // Table is indexed by enum directive_type
+    const char *str; // Lowercase directive name without the leading dot
+    int (*handler)(struct pars *self, size_t label_id); // Parses the arguments
+} g_directives[DIRECTIVES_COUNT] = {
+    [DT_NONE]  = { "",      NULL, }, // Dummy entry, never dispatched to
+    [DT_ALIGN] = { "align", pars_directive_handler_align, },
+    [DT_FILE]  = { "file",  pars_directive_handler_file, },
+};
+
 static bool should_be_escaped(const int c)
 {
     return c < ' ' || c == '"' || c == '\\' || c == '<' || c == '>' || c > '~';
@@ -1210,6 +1227,11 @@ static const char *mnemonic_to_string(const enum mnemonic m)
     return g_mnemmonics[m].str;
 }
 
+static const char *directive_to_string(const enum directive_type t) {
+    assert(t < DIRECTIVES_COUNT); // t is used as an index into g_directives
+    return g_directives[t].str; // Name without the leading dot, "" for DT_NONE
+}
+
 static const char *opsize_to_string(const enum opsize s)
 {
     switch (s) {
@@ -1242,6 +1264,27 @@ static enum mnemonic get_mnemonic_from_identifier(
     return MN_NONE;
 }
 
+static enum directive_type get_directive_from_identifier(
+        const char *const str, const size_t str_length)
+{
+    // str is the name without the leading dot; the longest directive has 8
+    // chars (e.g. "bsection" or "external"), so anything longer cannot match.
+    if (str_length > 8) {
+        return DT_NONE;
+    }
+    char directive_str[9] = {0}; // Lowercased copy, always NUL-terminated
+    for (size_t i = 0; i < str_length; i++) { // tolower(negative char) is UB
+        directive_str[i] = (char)tolower((unsigned char)str[i]);
+    }
+    // Start from 1 since entry 0 is the dummy DT_NONE
+    for (size_t i = 1; i < DIRECTIVES_COUNT; i++) {
+        if (0 == strcmp(directive_str, g_directives[i].str)) {
+            return (enum directive_type)i;
+        }
+    }
+    return DT_NONE;
+}
+
 static const char *arg_type_to_string(const enum arg_type type)
 {
     switch (type) {
@@ -1303,6 +1346,25 @@ static const char *stmt_type_to_string(const enum stmt_type type)
     return "_UNKNOWN";
 }
 
+static int fprint_tokens(
+        const struct lex *const lex,
+        const size_t first_token,
+        const size_t num_tokens,
+        FILE *const s)
+{ // Prints the token span to s as space-separated, double-quoted strings
+    for (size_t i = 0; i < num_tokens; i++) {
+        const struct token token = lex->tokbuf[first_token + i];
+        if (token.type == TT_NEWLINE) { // Stop at the first new line token
+            break;
+        }
+        if (i > 0) { // Single space between tokens, none before the first
+            fputc(' ', s);
+        }
+        fprintf(s, "\"%.*s\"", (int)token.length, lex->input + token.offset);
+    }
+    return 0; // Always 0; stream write errors are not checked
+}
+
 static void fprint_expr(
         const struct lex *const lex,
         const struct expr_tokens_span *const expr,
@@ -1386,18 +1448,9 @@ static void fprint_arg(
     case ARG_EXPR:
         break;
     }
-    fprintf(s, " raw \"");
-    for (size_t i = 0; i < arg->num_tokens; i++) {
-        const struct token token = lex->tokbuf[arg->first_token + i];
-        if (token.type == TT_NEWLINE) {
-            break;
-        }
-        if (i > 0) {
-            fputc(' ', s);
-        }
-        fprintf(s, "%.*s", (int)token.length, lex->input + token.offset);
-    }
-    fprintf(s, "\")");
+    fprintf(s, " raw-tokens [");
+    fprint_tokens(lex, arg->first_token, arg->num_tokens, s);
+    fprintf(s, "])");
 }
 
 static int fprint_stmt(
@@ -1409,10 +1462,17 @@ static int fprint_stmt(
     fprintf(s, "(%s", stmt_type_to_string(stmt->type));
     if (stmt->label_token) {
         const struct token label = lex->tokbuf[stmt->label_token];
-        fprintf(s, "\n\t(label \"%.*s\")", (int)label.length, lex->input + label.offset);
+        fprintf(
+                s,
+                "\n\t(label \"%.*s\")",
+                (int)label.length,
+                lex->input + label.offset);
     }
     if (stmt->type == ST_INSTRUCTION) {
-        fprintf(s, "\n\t(mnemonic \"%s\")", mnemonic_to_string(stmt->instruction.mnemonic));
+        fprintf(
+                s,
+                "\n\t(mnemonic \"%s\")",
+                mnemonic_to_string(stmt->instruction.mnemonic));
         fprintf(s, "\n\t(size %s)", opsize_to_string(stmt->instruction.opsize));
         if (stmt->instruction.arg1.type != ARG_NONE) {
             fprintf(s, "\n\t(arg1 ");
@@ -1425,23 +1485,30 @@ static int fprint_stmt(
             fprint_arg(lex, &stmt->instruction.arg2, s);
             fprintf(s, ")");
         }
+    } else if (stmt->type == ST_DIRECTIVE) {
+        fprintf(
+                s,
+                "\n\t(name \"%s\")",
+                directive_to_string(stmt->directive.type));
+        fprintf(s, "\n\t(arg (raw-tokens [");
+        fprint_tokens(
+                lex,
+                stmt->directive.first_token,
+                stmt->directive.num_tokens,
+                s);
+        fprintf(s, "]))");
     }
     if (stmt->comment_token) {
         const struct token comment = lex->tokbuf[stmt->comment_token];
-        fprintf(s, "\n\t(comment \"%.*s\")", (int)comment.length, lex->input + comment.offset);
-    }
-    fprintf(s, "\n\t(raw \"");
-    for (size_t i = 0; i < stmt->num_tokens; i++) {
-        const struct token token = lex->tokbuf[stmt->first_token + i];
-        if (token.type == TT_NEWLINE) {
-            break;
-        }
-        if (i > 0) {
-            fputc(' ', s);
-        }
-        fprintf(s, "%.*s", (int)token.length, lex->input + token.offset);
-    }
-    fprintf(s, "\"))\n");
+        fprintf(
+                s,
+                "\n\t(comment \"%.*s\")",
+                (int)comment.length,
+                lex->input + comment.offset);
+    }
+    fprintf(s, "\n\t(raw-tokens [");
+    fprint_tokens(lex, stmt->first_token, stmt->num_tokens, s);
+    fprintf(s, "]))\n");
     return 0;
 }
 
@@ -1537,14 +1604,6 @@ static int pars_yield_error_eof(
             self, l, "EOF", (sizeof "EOF") - 1, expected);
 }
 
-static int pars_parse_direc(
-        struct pars *const self, const struct token *const dot)
-{
-    (void) self;
-    (void) dot;
-    return pars_yield_error_msg(self, self->cur_tok_id, E_UNIMPL);
-}
-
 enum opsize get_opsize_from_specifier(const char size_specifier)
 {
     switch (tolower(size_specifier)) {
@@ -1728,7 +1787,7 @@ static int pars_parse_expr(
             }
             nesting--;
         } else {
-            if (nesting == 0) {
+            if (nesting == 0 && expect_close_or_binary) {
                 break;
             }
             return pars_yield_error(
@@ -1746,6 +1805,101 @@ static int pars_parse_expr(
     return OK;
 }
 
+static int pars_parse_comment_and_newline(
+        struct pars *const self, size_t *const output_comment_id)
+{
+    size_t comment_id = 0; // Stays 0 when no comment token is found
+    if (!pars_is_eof_reached(self)) {
+        // Take the next token only if it is a comment ('*' or ';' style)
+        const struct token token1 = pars_peek(self);
+        const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
+            token1.type == TT_COMMENT_SEMICOLON;
+        if (is_comment) {
+            comment_id = pars_commit(self);
+        }
+    }
+    if (!pars_is_eof_reached(self)) {
+        // Unless EOF is reached, the next token must be a new line
+        const size_t nl_id = pars_commit(self);
+        const struct token nl = self->lex->tokbuf[nl_id];
+        if (nl.type != TT_NEWLINE) {
+            return pars_yield_error( // After a comment only a new line is valid
+                    self, nl_id, comment_id ? E_NL : E_COMMENT_NL);
+        }
+    }
+    *output_comment_id = comment_id; // Written only on the success path
+    return OK;
+}
+
+static int pars_finish_directive(
+        struct pars *const self,
+        const size_t label_id,
+        const struct directive directive)
+{
+    // Finish parsing the directive: optional comment, then new line or EOF
+    size_t comment_id = 0;
+    const int ret = pars_parse_comment_and_newline(self, &comment_id);
+    if (ret != OK) {
+        return ret;
+    }
+    const struct stmt stmt = {
+        .type = ST_DIRECTIVE,
+        .directive = directive,
+        .label_token = label_id,
+        .comment_token = comment_id,
+        .first_token = label_id, // Statement span starts at the label token id
+        .num_tokens = self->cur_tok_id - label_id, // Up to the current token
+    };
+    fwrite_stmt(&stmt, self->stmttab_stream); // Presumably appends the record
+    return OK;
+}
+
+static int pars_directive_handler_align(
+        struct pars *const self, const size_t label_id)
+{
+    struct expr_tokens_span expr; // Read below only if pars_parse_expr is OK
+    const int ret = pars_parse_expr(self, &expr);
+    if (ret != OK) {
+        return ret;
+    }
+    const struct directive directive = {
+        .type = DT_ALIGN,
+        .first_token = expr.first_token, // Argument span is the expression
+        .num_tokens = expr.num_tokens,
+    };
+    return pars_finish_directive(self, label_id, directive);
+}
+
+static int pars_directive_handler_file(
+        struct pars *const self, const size_t label_id)
+{
+    const struct token filename = pars_peek(self);
+    if (filename.type != TT_STRING) { // The only argument must be a string
+        return pars_yield_error(self, self->cur_tok_id, E_STR);
+    }
+    const size_t first_token = pars_commit(self);
+    const struct directive directive = {
+        .type = DT_FILE,
+        .first_token = first_token, // The single string token is the argument
+        .num_tokens = 1
+    };
+    return pars_finish_directive(self, label_id, directive);
+}
+
+static int pars_parse_direc(struct pars *const self, const size_t label_id)
+{
+    const struct token dotid = pars_peek(self);
+    // Drop the leading dot from the directive name by advancing the string
+    // pointer by 1 and shrinking the length by 1
+    enum directive_type d = get_directive_from_identifier(
+            self->lex->input + dotid.offset + 1, dotid.length - 1);
+    if (d == DT_NONE) {
+        return pars_yield_error_msg(self, self->cur_tok_id, E_UNKNOWN_DRC);
+    }
+    pars_commit(self); // Directive token is committed only once it is known
+    return g_directives[d].handler(self, label_id); // Non-NULL: d != DT_NONE
+}
+
 static int pars_parse_arg_after_prefix_expr(
         struct pars *const self, struct arg *const arg)
 {
@@ -2328,7 +2482,7 @@ static int pars_parse_assignment(
 {
     (void) label_id;
     (void) symbol_id;
-    return pars_yield_error_msg(self, self->cur_tok_id, E_UNIMPL);
+    return pars_yield_error_msg(self, self->cur_tok_id, E_NIMPL);
 }
 
 static int pars_yield_label_comment(
@@ -2351,15 +2505,16 @@ static int pars_yield_label_comment(
 static int pars_parse_labeled_statement(
         struct pars *const self, const size_t label_id)
 {
-    const size_t token1_id = pars_commit(self);
-    const struct token token1 = self->lex->tokbuf[token1_id];
+    const struct token token1 = pars_peek(self);
     const bool is_comment = token1.type == TT_COMMENT_ASTERISK ||
         token1.type == TT_COMMENT_SEMICOLON;
     if (is_comment) {
-        return pars_yield_label_comment(self, label_id, token1_id);
+        return pars_yield_label_comment(self, label_id, pars_commit(self));
     } else if (token1.type == TT_NEWLINE) {
+        pars_commit(self);
         return pars_yield_label_comment(self, label_id, 0);
     } else if (token1.type == TT_ID) {
+        const size_t token1_id = pars_commit(self);
         if (pars_is_eof_reached(self)) {
             return pars_yield_error_eof(self, E_LABELED_STMT);
         }
@@ -2373,9 +2528,9 @@ static int pars_parse_labeled_statement(
         }
         return pars_parse_instruction(self, label_id, token1_id);
     } else if (token1.type == TT_DOT_ID) {
-        return pars_parse_direc(self, &token1);
+        return pars_parse_direc(self, label_id);
     }
-    return pars_yield_error(self, token1_id, E_STMT_BEGIN);
+    return pars_yield_error(self, self->cur_tok_id, E_STMT_BEGIN);
 }
 
 static int pars_parse_statement(struct pars *const self)
author	Oxore <oxore@protonmail.com>	2023-06-28 00:39:26 +0300
committer	Oxore <oxore@protonmail.com>	2023-06-28 00:52:44 +0300
commit	f77ac6908cca9fb2be32d04fd6387d417cbc9c39 (patch)
tree	afbed08bd677bb186351f1b82a4d9d9f5be4fc0d
parent	91bc07e04e9009d5eddaacf7002551046699d313 (diff)