diff options
| -rw-r--r-- | main.c | 196 | 
1 files changed, 145 insertions, 51 deletions
| @@ -8,9 +8,10 @@  #include <assert.h>  #include <errno.h>  #include <stdbool.h> +#include <stdint.h>  #include <stdio.h>  #include <stdlib.h> -#include <stdint.h> +#include <string.h>  #ifndef TRACE_LEX  #define TRACE_LEX 1 @@ -23,17 +24,18 @@  enum token_type {      TT_NONE = 0,      TT_SPACE, -    TT_TAB,      TT_NEWLINE, +    TT_ESCAPE,      TT_DOT,      TT_COMMA,      TT_PLUS,      TT_MINUS, +    TT_ASTERISK, +    TT_SLASH,      TT_EQ,      TT_COLON,      TT_PERCENT,      TT_HASH, -    TT_ASTERISK,      TT_STRING,      TT_IDENTIFIER,      TT_NUMDEC, @@ -60,7 +62,6 @@ enum lex_state {      LS_FREE = 0,      LS_CR,      LS_SPACE, -    LS_TAB,      LS_IDENTIFIER,      LS_NUMOCTHEX,      LS_NUMOCT, @@ -86,7 +87,6 @@ struct lex {      enum lex_error error;      size_t cursor;      size_t tok_offset; -    size_t tokens_count;      bool inside_line;      // Input data buffer      FILE *input_stream; @@ -111,7 +111,7 @@ enum stmt_kind {      SK_DIR_LINE,  }; -enum opcode { +enum mnemonic {      OPCODE_NONE,      OPCODE_NOP,  }; @@ -148,7 +148,7 @@ struct arg_8_xn {  };  struct instruction { -    enum opcode opcode; +    enum mnemonic mnemonic;      enum opsize opsize;      enum arg_kind arg1_kind, arg2_kind;      union { @@ -303,18 +303,19 @@ static const char *tok_kind_to_string(const enum token_type type)      switch (type) {      case TT_NONE: return "NONE";      case TT_SPACE: return "SPACE"; -    case TT_TAB: return "TAB";      case TT_NEWLINE: return "NEWLINE"; +    case TT_ESCAPE: return "ESCAPE";      case TT_DOT: return "DOT";      case TT_COMMA: return "COMMA";      case TT_PLUS: return "PLUS";      case TT_MINUS: return "MINUS"; +    case TT_ASTERISK: return "ASTERISK"; +    case TT_SLASH: return "SLASH";      case TT_EQ: return "EQ";      case TT_COLON: return "COLON";      case TT_PERCENT: return "PERCENT";      case TT_HASH: return "HASH";      case TT_STRING: return "STRING"; -    case TT_ASTERISK: return "ASTERISK";      case TT_IDENTIFIER: return "IDENTIFIER";      case TT_NUMDEC: return "NUMDEC";      case TT_NUMOCT: return "NUMOCT"; @@ -369,9 +370,8 @@ static int fwrite_token(const struct token *const token, FILE *const stream)  static void lex_yield_token(struct lex *const self, const struct token *const token)  { -    self->inside_line = token->type != TT_NEWLINE; +    self->inside_line = (token->type != TT_NEWLINE) && (token->type != TT_ESCAPE);      fwrite_token(token, self->tokbuf_stream); -    self->tokens_count++;  }  static const char *lex_state_error_string( @@ -406,7 +406,6 @@ static const char *lex_state_error_string(              "or EOF";      case LS_CR:      case LS_SPACE: -    case LS_TAB:      case LS_IDENTIFIER:      case LS_STRING:      case LS_STRING_ESC: @@ -433,11 +432,11 @@ static struct line_pos_info lex_get_line_pos_info(const struct lex *const self)              l.line_num++;              l.column_num = 0;          } else if (c == '\n') { -            cr = false; -            l.line_offset = i + 1;              if (!cr) {                  l.line_num++;              } +            cr = false; +            l.line_offset = i + 1;              l.column_num = 0;          } else {              cr = false; @@ -451,6 +450,7 @@ static int lex_yield_error(struct lex *const self, const int c)  {      fflush(self->input_stream);      const struct line_pos_info l = lex_get_line_pos_info(self); +    const size_t cursor = self->cursor;      {          // Read out the rest of the line          int c; @@ -461,13 +461,15 @@ static int lex_yield_error(struct lex *const self, const int c)          } while (c != EOF && c != '\n' && c != '\r');          fflush(self->input_stream);      } +    const unsigned char c_char = (c == EOF) ? 0 : c;      fprintf(              stderr, -            "<stdin>:%lu:%lu: lexing error: expected %s, found '%c'\n", +            "<stdin>:%lu:%lu: lexing error: expected %s, found '",              l.line_num + 1,              l.column_num + 1, -            lex_state_error_string(self->state, self->inside_line), -            c); +            lex_state_error_string(self->state, self->inside_line)); +    fputs(g_escape_table[c_char], stderr); +    fputs("'\n", stderr);      fprintf(stderr, "%5lu | %s\n", l.line_num + 1, self->input + l.line_offset);      fputs("      | ", stderr);      for (size_t i = 0; i < l.column_num; i++) { @@ -478,6 +480,7 @@ static int lex_yield_error(struct lex *const self, const int c)          }      }      fputs("^\n", stderr); +    fprintf(stderr, "<stdin>: %lu bytes parsed\n", cursor);      self->state = LS_ERROR;      return ERR;  } @@ -487,7 +490,6 @@ static int lex_handle_next(struct lex *const self, const int c)      switch (self->state) {      case LS_FREE:          if (is_alphabetic(c) || c == '_') { -            self->inside_line = false;              self->tok_offset = self->cursor;              self->state = LS_IDENTIFIER;          } else if (c == '0') { @@ -502,12 +504,6 @@ static int lex_handle_next(struct lex *const self, const int c)          } else if (c == ';') {              self->tok_offset = self->cursor;              self->state = LS_COMMENT_SEMICOLON; -        } else if (c == '*') { -            if (self->inside_line) { -                return lex_yield_error(self, c); -            } -            self->tok_offset = self->cursor; -            self->state = LS_COMMENT_ASTERISK;          } else if (c == ',') {              lex_yield_token(self, &(struct token){TT_COMMA, self->cursor, 1});          } else if (c == '.') { @@ -520,6 +516,15 @@ static int lex_handle_next(struct lex *const self, const int c)              lex_yield_token(self, &(struct token){TT_PLUS, self->cursor, 1});          } else if (c == '-') {              lex_yield_token(self, &(struct token){TT_MINUS, self->cursor, 1}); +        } else if (c == '*') { +            if (self->inside_line) { +                lex_yield_token( +                        self, &(struct token){TT_ASTERISK, self->cursor, 1}); +            } +            self->tok_offset = self->cursor; +            self->state = LS_COMMENT_ASTERISK; +        } else if (c == '/') { +            lex_yield_token(self, &(struct token){TT_SLASH, self->cursor, 1});          } else if (c == '=') {              lex_yield_token(self, &(struct token){TT_EQ, self->cursor, 1});          } else if (c == ':') { @@ -533,14 +538,15 @@ static int lex_handle_next(struct lex *const self, const int c)              self->state = LS_CR;          } else if (c == '\n') {              lex_yield_token(self, &(struct token){TT_NEWLINE, self->cursor, 1}); -        } else if (c == ' ') { +        } else if (c == '\\') { +            lex_yield_token(self, &(struct token){TT_ESCAPE, self->cursor, 1}); +        } else if (c == ' ' || c == '\t') {              self->tok_offset = self->cursor;              self->state = LS_SPACE; -        } else if (c == '\t') { -            self->tok_offset = self->cursor; -            self->state = LS_TAB;          } else if (c == EOF) {              self->state = LS_EOF; +        } else if (c == '\x1a') { +            // Ignore "End of file" character          } else {              return lex_yield_error(self, c);          } @@ -558,26 +564,20 @@ static int lex_handle_next(struct lex *const self, const int c)          }          break;      case LS_SPACE: // Accumulate multiple spaces into single token -        if (c != ' ') { -            const size_t length =  self->cursor - self->tok_offset; -            const struct token token = {TT_SPACE, self->tok_offset, length}; -            lex_yield_token(self, &token); -            self->state = LS_FREE; -            return lex_handle_next(self, c); -        } -        break; -    case LS_TAB: // Accumulate multiple tabs into single token -        if (c != '\t') { -            const size_t length =  self->cursor - self->tok_offset; -            const struct token token = {TT_TAB, self->tok_offset, length}; -            lex_yield_token(self, &token); +        if (c != ' ' && c != '\t') { +            // Only spaces and tabs at the beginning of the line are significant +            if (!self->inside_line) { +                const size_t length = self->cursor - self->tok_offset; +                const struct token token = {TT_SPACE, self->tok_offset, length}; +                lex_yield_token(self, &token); +            }              self->state = LS_FREE;              return lex_handle_next(self, c);          }          break;      case LS_IDENTIFIER:          if (!is_alphanum(c) && c != '_') { -            const size_t length =  self->cursor - self->tok_offset; +            const size_t length = self->cursor - self->tok_offset;              const struct token token = {TT_IDENTIFIER, self->tok_offset, length};              lex_yield_token(self, &token);              self->state = LS_FREE; @@ -604,7 +604,7 @@ static int lex_handle_next(struct lex *const self, const int c)          if (is_alphabetic(c) || c == '_') {              return lex_yield_error(self, c);          } else if (!is_oct(c)) { -            const size_t length =  self->cursor - self->tok_offset; +            const size_t length = self->cursor - self->tok_offset;              const struct token token = {TT_NUMOCT, self->tok_offset, length};              lex_yield_token(self, &token);              // This token is finished, handle this char in LS_FREE state @@ -619,7 +619,7 @@ static int lex_handle_next(struct lex *const self, const int c)              // Panik!              return lex_yield_error(self, c);          } else { -            const size_t length =  self->cursor - self->tok_offset; +            const size_t length = self->cursor - self->tok_offset;              const struct token token = {TT_NUMHEX, self->tok_offset, length};              lex_yield_token(self, &token);              // This token is finished, handle this char in LS_FREE state @@ -631,7 +631,7 @@ static int lex_handle_next(struct lex *const self, const int c)          if (is_alphabetic(c) || c == '_') {              return lex_yield_error(self, c);          } else if (!is_dec(c)) { -            const size_t length =  self->cursor - self->tok_offset; +            const size_t length = self->cursor - self->tok_offset;              const struct token token = {TT_NUMDEC, self->tok_offset, length};              lex_yield_token(self, &token);              // This token is finished, handle this char in LS_FREE state @@ -643,7 +643,7 @@ static int lex_handle_next(struct lex *const self, const int c)          if (c == '\\') {              self->state = LS_STRING_ESC;          } else if (c == '"') { -            const size_t length =  self->cursor - self->tok_offset + 1; +            const size_t length = self->cursor - self->tok_offset + 1;              const struct token token = {TT_STRING, self->tok_offset, length};              lex_yield_token(self, &token);              // This token is finished @@ -655,7 +655,7 @@ static int lex_handle_next(struct lex *const self, const int c)          break;      case LS_COMMENT_ASTERISK:          if (c == '\r' || c == '\n') { -            const size_t length =  self->cursor - self->tok_offset; +            const size_t length = self->cursor - self->tok_offset;              const struct token token = {TT_COMMENT_ASTERISK, self->tok_offset, length};              lex_yield_token(self, &token);              // This token is finished, handle this char in LS_FREE state @@ -665,7 +665,7 @@ static int lex_handle_next(struct lex *const self, const int c)          break;      case LS_COMMENT_SEMICOLON:          if (c == '\r' || c == '\n') { -            const size_t length =  self->cursor - self->tok_offset; +            const size_t length = self->cursor - self->tok_offset;              const struct token token = {TT_COMMENT_SEMICOLON, self->tok_offset, length};              lex_yield_token(self, &token);              // This token is finished, handle this char in LS_FREE state @@ -747,14 +747,108 @@ static int pars_init(struct pars *const self, const struct lex *const lex)      return OK;  } +static int pars_yield_error(struct pars *const self) +{ +    (void) self; +    // TODO +    return ERR; +} + +static int pars_parse_label(struct pars *const self) +{ +    const struct token label = self->lex->tokbuf[self->cur_tok_id++]; +    if (label.type != TT_IDENTIFIER) { +        return pars_yield_error(self); +    } +    // TODO +    return OK; +} + +static int pars_parse_instr(struct pars *const self, const struct token mnemo) +{ +    (void) self; +    (void) mnemo; +    // TODO +    return OK; +} + +static int pars_parse_direc(struct pars *const self) +{ +    const char *input = self->lex->input; +    const struct token direc = self->lex->tokbuf[self->cur_tok_id++]; +    if (direc.type != TT_IDENTIFIER) { +        return pars_yield_error(self); +    } +    if (0 == strcmp(input + direc.offset, "def")) { +    } else if (0 == strcmp(input + direc.offset, "opt")) { +    } else if (0 == strcmp(input + direc.offset, "file")) { +    } else if (0 == strcmp(input + direc.offset, "text")) { +    } else if (0 == strcmp(input + direc.offset, "align")) { +    } else if (0 == strcmp(input + direc.offset, "globl")) { +    } else if (0 == strcmp(input + direc.offset, "ln")) { +    } else if (0 == strcmp(input + direc.offset, "long")) { +    } else if (0 == strcmp(input + direc.offset, "word")) { +    } else if (0 == strcmp(input + direc.offset, "byte")) { +    } else if (0 == strcmp(input + direc.offset, "bin")) { +    } +    // TODO +    return OK; +} + +static int pars_parse_instr_or_direc(struct pars *const self) +{ +    struct token token; +    assert(token.type != TT_SPACE); +    if (token.type == TT_NEWLINE) { +        return OK; +    } else if (token.type == TT_DOT) { +        return pars_parse_direc(self); +    } else if (token.type == TT_IDENTIFIER) { +        return pars_parse_instr(self, token); +    } else { +        return pars_yield_error(self); +    } +    return OK; +} + +static int pars_parse_statement(struct pars *const self) +{ +    const size_t tokens_count = self->lex->tokbuf_size / +        (sizeof *self->lex->tokbuf); +    const struct token token = self->lex->tokbuf[self->cur_tok_id++]; +    const bool is_comment = token.type == TT_COMMENT_ASTERISK || +        token.type == TT_COMMENT_SEMICOLON; +    if (token.type == TT_SPACE) { +        return pars_parse_instr_or_direc(self); +    } else if (token.type == TT_IDENTIFIER) { +        return pars_parse_label(self); +    } else if (!is_comment) { +        if (self->cur_tok_id < tokens_count) { +            const struct token nl = self->lex->tokbuf[self->cur_tok_id++]; +            assert(nl.type == TT_NEWLINE); +        } +        return OK; +    } else if (token.type == TT_NEWLINE) { +        return OK; +    } +    return pars_yield_error(self); +} +  /** Run parser until the end of the input reached   * \returns OK if parsing finished successfully   * \returns ERR if error encountered and parsing cannot continue.   */  static int pars_run(struct pars *const self)  { -    (void) self; -    // TODO +    return OK; +    const size_t tokens_count = self->lex->tokbuf_size / +        (sizeof *self->lex->tokbuf); +    do { +        const int ret = pars_parse_statement(self); +        if (ret != OK) { +            return ret; +        } +    } while (self->cur_tok_id < tokens_count);      return OK;  } @@ -784,7 +878,7 @@ static int assem_emit(struct assem *const self, FILE *const stream)  {      if (TRACE_LEX) {          const struct lex *const lex = self->pars->lex; -        for (size_t i = 1; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) { +        for (size_t i = 0; i < lex->tokbuf_size / (sizeof *lex->tokbuf); i++) {              fprint_tok(lex->input, &lex->tokbuf[i], stream);          }      } | 
