summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Oxore <oxore@protonmail.com> 2023-06-24 15:27:10 +0300
committer Oxore <oxore@protonmail.com> 2023-06-24 15:30:41 +0300
commit 19812eab123d347435929dd4f40649b467d3b457 (patch)
tree f37f0694b553d3b069811c9d816ef47e77db7629
parent e0029661122547edf6261bd6a56ed7fc090170ff (diff)
Change lexer according to documentation
-rw-r--r-- main.c 128
1 files changed, 102 insertions, 26 deletions
diff --git a/main.c b/main.c
index 2b3ab2f..48c1d69 100644
--- a/main.c
+++ b/main.c
@@ -23,7 +23,6 @@
enum token_type {
TT_NONE = 0,
- TT_SPACE,
TT_NEWLINE,
TT_ESCAPE,
TT_DOT,
@@ -33,9 +32,18 @@ enum token_type {
TT_ASTERISK,
TT_SLASH,
TT_EQ,
+ TT_EQ_DOUBLE,
TT_COLON,
TT_PERCENT,
+ TT_LEFT_SHIFT,
+ TT_RIGHT_SHIFT,
TT_HASH,
+ TT_BANG,
+ TT_DOLLAR,
+ TT_TILDE,
+ TT_AMPERSAND,
+ TT_PIPE,
+ TT_CAP,
TT_STRING,
TT_IDENTIFIER,
TT_NUMDEC,
@@ -43,6 +51,10 @@ enum token_type {
TT_NUMHEX,
TT_LPAREN,
TT_RPAREN,
+ TT_LBRACKET,
+ TT_RBRACKET,
+ TT_RBRACE,
+ TT_LBRACE,
TT_COMMENT_ASTERISK,
TT_COMMENT_SEMICOLON,
};
@@ -61,7 +73,9 @@ enum lex_error {
enum lex_state {
LS_FREE = 0,
LS_CR,
- LS_SPACE,
+ LS_LEFT_SHIFT,
+ LS_RIGHT_SHIFT,
+ LS_EQ,
LS_IDENTIFIER,
LS_NUMOCTHEX,
LS_NUMOCT,
@@ -302,7 +316,6 @@ static const char *tok_kind_to_string(const enum token_type type)
{
switch (type) {
case TT_NONE: return "NONE";
- case TT_SPACE: return "SPACE";
case TT_NEWLINE: return "NEWLINE";
case TT_ESCAPE: return "ESCAPE";
case TT_DOT: return "DOT";
@@ -312,16 +325,29 @@ static const char *tok_kind_to_string(const enum token_type type)
case TT_ASTERISK: return "ASTERISK";
case TT_SLASH: return "SLASH";
case TT_EQ: return "EQ";
+ case TT_EQ_DOUBLE: return "EQ_DOUBLE";
case TT_COLON: return "COLON";
case TT_PERCENT: return "PERCENT";
+ case TT_LEFT_SHIFT: return "LEFT_SHIFT";
+ case TT_RIGHT_SHIFT: return "RIGHT_SHIFT";
case TT_HASH: return "HASH";
+ case TT_BANG: return "BANG";
+ case TT_DOLLAR: return "DOLLAR";
+ case TT_TILDE: return "TILDE";
+ case TT_AMPERSAND: return "AMPERSAND";
+ case TT_PIPE: return "PIPE";
+ case TT_CAP: return "CAP";
case TT_STRING: return "STRING";
case TT_IDENTIFIER: return "IDENTIFIER";
case TT_NUMDEC: return "NUMDEC";
case TT_NUMOCT: return "NUMOCT";
case TT_NUMHEX: return "NUMHEX";
- case TT_LPAREN: return "PARENL";
- case TT_RPAREN: return "PARENR";
+ case TT_LPAREN: return "LPAREN";
+ case TT_RPAREN: return "RPAREN";
+ case TT_LBRACKET: return "LBRACKET";
+ case TT_RBRACKET: return "RBRACKET";
+ case TT_LBRACE: return "LBRACE";
+ case TT_RBRACE: return "RBRACE";
case TT_COMMENT_ASTERISK: return "COMMENT";
case TT_COMMENT_SEMICOLON: return "COMMENT";
}
@@ -404,8 +430,12 @@ static const char *lex_state_error_string(
return "';', '[0-9]' , ',', '.', '(', ')', '+', "
"'-', '=', ':', '%', '#', ' ', '\\t', '\\r', '\\n', '\\r\\n' "
"or EOF";
+ case LS_LEFT_SHIFT:
+ return "'<'";
+ case LS_RIGHT_SHIFT:
+ return "'>'";
case LS_CR:
- case LS_SPACE:
+ case LS_EQ:
case LS_IDENTIFIER:
case LS_STRING:
case LS_STRING_ESC:
@@ -498,12 +528,21 @@ static int lex_handle_next(struct lex *const self, const int c)
} else if (is_dec(c)) {
self->tok_offset = self->cursor;
self->state = LS_NUMDEC;
+ } else if (c == '@') {
+ self->tok_offset = self->cursor;
+ self->state = LS_NUMOCT;
} else if (c == '"') {
self->tok_offset = self->cursor;
self->state = LS_STRING;
} else if (c == ';') {
self->tok_offset = self->cursor;
self->state = LS_COMMENT_SEMICOLON;
+ } else if (c == '<') {
+ self->tok_offset = self->cursor;
+ self->state = LS_LEFT_SHIFT;
+ } else if (c == '>') {
+ self->tok_offset = self->cursor;
+ self->state = LS_RIGHT_SHIFT;
} else if (c == ',') {
lex_yield_token(self, &(struct token){TT_COMMA, self->cursor, 1});
} else if (c == '.') {
@@ -512,6 +551,14 @@ static int lex_handle_next(struct lex *const self, const int c)
lex_yield_token(self, &(struct token){TT_LPAREN, self->cursor, 1});
} else if (c == ')') {
lex_yield_token(self, &(struct token){TT_RPAREN, self->cursor, 1});
+ } else if (c == '[') {
+ lex_yield_token(self, &(struct token){TT_LBRACKET, self->cursor, 1});
+ } else if (c == ']') {
+ lex_yield_token(self, &(struct token){TT_RBRACKET, self->cursor, 1});
+ } else if (c == '{') { // opening brace: single-character token
+ lex_yield_token(self, &(struct token){TT_LBRACE, self->cursor, 1});
+ } else if (c == '}') { // closing brace: must test '}' or TT_RBRACE is unreachable
+ lex_yield_token(self, &(struct token){TT_RBRACE, self->cursor, 1});
} else if (c == '+') {
lex_yield_token(self, &(struct token){TT_PLUS, self->cursor, 1});
} else if (c == '-') {
@@ -520,19 +567,33 @@ static int lex_handle_next(struct lex *const self, const int c)
if (self->inside_line) {
lex_yield_token(
self, &(struct token){TT_ASTERISK, self->cursor, 1});
+ } else {
+ self->tok_offset = self->cursor;
+ self->state = LS_COMMENT_ASTERISK;
}
- self->tok_offset = self->cursor;
- self->state = LS_COMMENT_ASTERISK;
} else if (c == '/') {
lex_yield_token(self, &(struct token){TT_SLASH, self->cursor, 1});
} else if (c == '=') {
- lex_yield_token(self, &(struct token){TT_EQ, self->cursor, 1});
+ self->tok_offset = self->cursor;
+ self->state = LS_EQ;
} else if (c == ':') {
lex_yield_token(self, &(struct token){TT_COLON, self->cursor, 1});
} else if (c == '%') {
lex_yield_token(self, &(struct token){TT_PERCENT, self->cursor, 1});
} else if (c == '#') {
lex_yield_token(self, &(struct token){TT_HASH, self->cursor, 1});
+ } else if (c == '!') {
+ lex_yield_token(self, &(struct token){TT_BANG, self->cursor, 1});
+ } else if (c == '$') {
+ lex_yield_token(self, &(struct token){TT_DOLLAR, self->cursor, 1});
+ } else if (c == '~') {
+ lex_yield_token(self, &(struct token){TT_TILDE, self->cursor, 1});
+ } else if (c == '&') {
+ lex_yield_token(self, &(struct token){TT_AMPERSAND, self->cursor, 1});
+ } else if (c == '|') {
+ lex_yield_token(self, &(struct token){TT_PIPE, self->cursor, 1});
+ } else if (c == '^') {
+ lex_yield_token(self, &(struct token){TT_CAP, self->cursor, 1});
} else if (c == '\r') {
self->tok_offset = self->cursor;
self->state = LS_CR;
@@ -541,8 +602,7 @@ static int lex_handle_next(struct lex *const self, const int c)
} else if (c == '\\') {
lex_yield_token(self, &(struct token){TT_ESCAPE, self->cursor, 1});
} else if (c == ' ' || c == '\t') {
- self->tok_offset = self->cursor;
- self->state = LS_SPACE;
+ // ignore spaces and tabs
} else if (c == EOF) {
self->state = LS_EOF;
} else if (c == '\x1a') {
@@ -563,15 +623,36 @@ static int lex_handle_next(struct lex *const self, const int c)
}
}
break;
- case LS_SPACE: // Accumulate multiple spaces into single token
- if (c != ' ' && c != '\t') {
- // Only spaces and tabs at the beginning of the line are significant
- if (!self->inside_line) {
- const size_t length = self->cursor - self->tok_offset;
- const struct token token = {TT_SPACE, self->tok_offset, length};
- lex_yield_token(self, &token);
- }
+ case LS_LEFT_SHIFT: // a single '<' was seen; only "<<" forms a valid token
+ if (c == '<') {
+ const size_t length = self->cursor - self->tok_offset + 1; // cursor is on the 2nd '<', include it
+ const struct token token = {TT_LEFT_SHIFT, self->tok_offset, length};
+ lex_yield_token(self, &token);
+ self->state = LS_FREE;
+ } else {
+ return lex_yield_error(self, c); // lone '<' is rejected
+ }
+ break;
+ case LS_RIGHT_SHIFT: // a single '>' was seen; only ">>" forms a valid token
+ if (c == '>') {
+ const size_t length = self->cursor - self->tok_offset + 1; // cursor is on the 2nd '>', include it
+ const struct token token = {TT_RIGHT_SHIFT, self->tok_offset, length};
+ lex_yield_token(self, &token);
 self->state = LS_FREE;
+ } else {
+ return lex_yield_error(self, c); // lone '>' is rejected
+ }
+ break;
+ case LS_EQ:
+ {
+ const size_t length = (c == '=') ? 2 : 1;
+ const enum token_type type = (c == '=') ? TT_EQ_DOUBLE : TT_EQ;
+ const struct token token = {type, self->tok_offset, length};
+ lex_yield_token(self, &token);
+ }
+ self->state = LS_FREE;
+ if (c != '=') {
+ // It is just single eq "=", handle this char in LS_FREE state then
return lex_handle_next(self, c);
}
break;
@@ -798,10 +879,7 @@ static int pars_parse_direc(struct pars *const self)
static int pars_parse_instr_or_direc(struct pars *const self)
{
struct token token;
- assert(token.type != TT_SPACE);
- if (token.type == TT_NEWLINE) {
- return OK;
- } else if (token.type == TT_DOT) {
+ if (token.type == TT_DOT) {
return pars_parse_direc(self);
} else if (token.type == TT_IDENTIFIER) {
return pars_parse_instr(self, token);
@@ -818,9 +896,7 @@ static int pars_parse_statement(struct pars *const self)
const struct token token = self->lex->tokbuf[self->cur_tok_id++];
const bool is_comment = token.type == TT_COMMENT_ASTERISK ||
token.type == TT_COMMENT_SEMICOLON;
- if (token.type == TT_SPACE) {
- return pars_parse_instr_or_direc(self);
- } else if (token.type == TT_IDENTIFIER) {
+ if (token.type == TT_IDENTIFIER) {
return pars_parse_label(self);
} else if (!is_comment) {
if (self->cur_tok_id < tokens_count) {