diff options
author | Oxore <oxore@protonmail.com> | 2023-06-24 15:27:10 +0300 |
---|---|---|
committer | Oxore <oxore@protonmail.com> | 2023-06-24 15:30:41 +0300 |
commit | 19812eab123d347435929dd4f40649b467d3b457 (patch) | |
tree | f37f0694b553d3b069811c9d816ef47e77db7629 | |
parent | e0029661122547edf6261bd6a56ed7fc090170ff (diff) |
Change lexer according to documentation
-rw-r--r-- | main.c | 128 |
1 files changed, 102 insertions, 26 deletions
@@ -23,7 +23,6 @@ enum token_type { TT_NONE = 0, - TT_SPACE, TT_NEWLINE, TT_ESCAPE, TT_DOT, @@ -33,9 +32,18 @@ enum token_type { TT_ASTERISK, TT_SLASH, TT_EQ, + TT_EQ_DOUBLE, TT_COLON, TT_PERCENT, + TT_LEFT_SHIFT, + TT_RIGHT_SHIFT, TT_HASH, + TT_BANG, + TT_DOLLAR, + TT_TILDE, + TT_AMPERSAND, + TT_PIPE, + TT_CAP, TT_STRING, TT_IDENTIFIER, TT_NUMDEC, @@ -43,6 +51,10 @@ enum token_type { TT_NUMHEX, TT_LPAREN, TT_RPAREN, + TT_LBRACKET, + TT_RBRACKET, + TT_RBRACE, + TT_LBRACE, TT_COMMENT_ASTERISK, TT_COMMENT_SEMICOLON, }; @@ -61,7 +73,9 @@ enum lex_error { enum lex_state { LS_FREE = 0, LS_CR, - LS_SPACE, + LS_LEFT_SHIFT, + LS_RIGHT_SHIFT, + LS_EQ, LS_IDENTIFIER, LS_NUMOCTHEX, LS_NUMOCT, @@ -302,7 +316,6 @@ static const char *tok_kind_to_string(const enum token_type type) { switch (type) { case TT_NONE: return "NONE"; - case TT_SPACE: return "SPACE"; case TT_NEWLINE: return "NEWLINE"; case TT_ESCAPE: return "ESCAPE"; case TT_DOT: return "DOT"; @@ -312,16 +325,29 @@ static const char *tok_kind_to_string(const enum token_type type) case TT_ASTERISK: return "ASTERISK"; case TT_SLASH: return "SLASH"; case TT_EQ: return "EQ"; + case TT_EQ_DOUBLE: return "EQ_DOUBLE"; case TT_COLON: return "COLON"; case TT_PERCENT: return "PERCENT"; + case TT_LEFT_SHIFT: return "LEFT_SHIFT"; + case TT_RIGHT_SHIFT: return "RIGHT_SHIFT"; case TT_HASH: return "HASH"; + case TT_BANG: return "BANG"; + case TT_DOLLAR: return "DOLLAR"; + case TT_TILDE: return "TILDE"; + case TT_AMPERSAND: return "AMPERSAND"; + case TT_PIPE: return "PIPE"; + case TT_CAP: return "CAP"; case TT_STRING: return "STRING"; case TT_IDENTIFIER: return "IDENTIFIER"; case TT_NUMDEC: return "NUMDEC"; case TT_NUMOCT: return "NUMOCT"; case TT_NUMHEX: return "NUMHEX"; - case TT_LPAREN: return "PARENL"; - case TT_RPAREN: return "PARENR"; + case TT_LPAREN: return "LPAREN"; + case TT_RPAREN: return "RPAREN"; + case TT_LBRACKET: return "LBRACKET"; + case TT_RBRACKET: return "RBRACKET"; + case TT_LBRACE: return "LBRACE"; + case TT_RBRACE: return "RBRACE"; case TT_COMMENT_ASTERISK: return "COMMENT"; case TT_COMMENT_SEMICOLON: return "COMMENT"; } @@ -404,8 +430,12 @@ static const char *lex_state_error_string( return "';', '[0-9]' , ',', '.', '(', ')', '+', " "'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' " "or EOF"; + case LS_LEFT_SHIFT: + return "'<'"; + case LS_RIGHT_SHIFT: + return "'>'"; case LS_CR: - case LS_SPACE: + case LS_EQ: case LS_IDENTIFIER: case LS_STRING: case LS_STRING_ESC: @@ -498,12 +528,21 @@ static int lex_handle_next(struct lex *const self, const int c) } else if (is_dec(c)) { self->tok_offset = self->cursor; self->state = LS_NUMDEC; + } else if (c == '@') { + self->tok_offset = self->cursor; + self->state = LS_NUMOCT; } else if (c == '"') { self->tok_offset = self->cursor; self->state = LS_STRING; } else if (c == ';') { self->tok_offset = self->cursor; self->state = LS_COMMENT_SEMICOLON; + } else if (c == '<') { + self->tok_offset = self->cursor; + self->state = LS_LEFT_SHIFT; + } else if (c == '>') { + self->tok_offset = self->cursor; + self->state = LS_RIGHT_SHIFT; } else if (c == ',') { lex_yield_token(self, &(struct token){TT_COMMA, self->cursor, 1}); } else if (c == '.') { @@ -512,6 +551,14 @@ static int lex_handle_next(struct lex *const self, const int c) lex_yield_token(self, &(struct token){TT_LPAREN, self->cursor, 1}); } else if (c == ')') { lex_yield_token(self, &(struct token){TT_RPAREN, self->cursor, 1}); + } else if (c == '[') { + lex_yield_token(self, &(struct token){TT_LBRACKET, self->cursor, 1}); + } else if (c == ']') { + lex_yield_token(self, &(struct token){TT_RBRACKET, self->cursor, 1}); + } else if (c == '{') { + lex_yield_token(self, &(struct token){TT_LBRACE, self->cursor, 1}); + } else if (c == '{') { + lex_yield_token(self, &(struct token){TT_RBRACE, self->cursor, 1}); } else if (c == '+') { lex_yield_token(self, &(struct token){TT_PLUS, self->cursor, 1}); } else if (c == '-') { @@ -520,19 +567,33 @@ static int lex_handle_next(struct lex *const self, const int c) if (self->inside_line) { lex_yield_token( self, &(struct token){TT_ASTERISK, self->cursor, 1}); + } else { + self->tok_offset = self->cursor; + self->state = LS_COMMENT_ASTERISK; } - self->tok_offset = self->cursor; - self->state = LS_COMMENT_ASTERISK; } else if (c == '/') { lex_yield_token(self, &(struct token){TT_SLASH, self->cursor, 1}); } else if (c == '=') { - lex_yield_token(self, &(struct token){TT_EQ, self->cursor, 1}); + self->tok_offset = self->cursor; + self->state = LS_EQ; } else if (c == ':') { lex_yield_token(self, &(struct token){TT_COLON, self->cursor, 1}); } else if (c == '%') { lex_yield_token(self, &(struct token){TT_PERCENT, self->cursor, 1}); } else if (c == '#') { lex_yield_token(self, &(struct token){TT_HASH, self->cursor, 1}); + } else if (c == '!') { + lex_yield_token(self, &(struct token){TT_BANG, self->cursor, 1}); + } else if (c == '$') { + lex_yield_token(self, &(struct token){TT_DOLLAR, self->cursor, 1}); + } else if (c == '~') { + lex_yield_token(self, &(struct token){TT_TILDE, self->cursor, 1}); + } else if (c == '&') { + lex_yield_token(self, &(struct token){TT_AMPERSAND, self->cursor, 1}); + } else if (c == '|') { + lex_yield_token(self, &(struct token){TT_PIPE, self->cursor, 1}); + } else if (c == '^') { + lex_yield_token(self, &(struct token){TT_CAP, self->cursor, 1}); } else if (c == '\r') { self->tok_offset = self->cursor; self->state = LS_CR; @@ -541,8 +602,7 @@ static int lex_handle_next(struct lex *const self, const int c) } else if (c == '\\') { lex_yield_token(self, &(struct token){TT_ESCAPE, self->cursor, 1}); } else if (c == ' ' || c == '\t') { - self->tok_offset = self->cursor; - self->state = LS_SPACE; + // ignore spaces and tabs } else if (c == EOF) { self->state = LS_EOF; } else if (c == '\x1a') { @@ -563,15 +623,36 @@ static int lex_handle_next(struct lex *const self, const int c) } } break; - case LS_SPACE: // Accumulate multiple spaces into single token - if (c != ' ' && c != '\t') { - // Only spaces and tabs at the beginning of the line are significant - if (!self->inside_line) { - const size_t length = self->cursor - self->tok_offset; - const struct token token = {TT_SPACE, self->tok_offset, length}; - lex_yield_token(self, &token); - } + case LS_LEFT_SHIFT: + if (c == '<') { + const size_t length = self->cursor - self->tok_offset; + const struct token token = {TT_LEFT_SHIFT, self->tok_offset, length}; + lex_yield_token(self, &token); + self->state = LS_FREE; + } else { + return lex_yield_error(self, c); + } + break; + case LS_RIGHT_SHIFT: + if (c == '>') { + const size_t length = self->cursor - self->tok_offset; + const struct token token = {TT_RIGHT_SHIFT, self->tok_offset, length}; + lex_yield_token(self, &token); self->state = LS_FREE; + } else { + return lex_yield_error(self, c); + } + break; + case LS_EQ: + { + const size_t length = (c == '=') ? 2 : 1; + const enum token_type type = (c == '=') ? TT_EQ_DOUBLE : TT_EQ; + const struct token token = {type, self->tok_offset, length}; + lex_yield_token(self, &token); + } + self->state = LS_FREE; + if (c != '=') { + // It is just single eq "=", handle this char in LS_FREE state then return lex_handle_next(self, c); } break; @@ -798,10 +879,7 @@ static int pars_parse_direc(struct pars *const self) static int pars_parse_instr_or_direc(struct pars *const self) { struct token token; - assert(token.type != TT_SPACE); - if (token.type == TT_NEWLINE) { - return OK; - } else if (token.type == TT_DOT) { + if (token.type == TT_DOT) { return pars_parse_direc(self); } else if (token.type == TT_IDENTIFIER) { return pars_parse_instr(self, token); @@ -818,9 +896,7 @@ static int pars_parse_statement(struct pars *const self) const struct token token = self->lex->tokbuf[self->cur_tok_id++]; const bool is_comment = token.type == TT_COMMENT_ASTERISK || token.type == TT_COMMENT_SEMICOLON; - if (token.type == TT_SPACE) { - return pars_parse_instr_or_direc(self); - } else if (token.type == TT_IDENTIFIER) { + if (token.type == TT_IDENTIFIER) { return pars_parse_label(self); } else if (!is_comment) { if (self->cur_tok_id < tokens_count) { |