summaryrefslogtreecommitdiff
path: root/src/tracetab.cpp
diff options
context:
space:
mode:
author Oxore <oxore@protonmail.com> 2025-01-03 17:07:00 +0300
committer Oxore <oxore@protonmail.com> 2025-01-07 14:39:01 +0300
commit cb96278e25140cfcc1afc22df2102bcf3b6ae38c (patch)
tree 9e93bd8a5fb4d5fbc177924b6b25ca8cd04e7fd7 /src/tracetab.cpp
parent 810dc87cd5173f8cfc81c774fd49cf8f928a9ae8 (diff)
Impl extended trace table format parser
Diffstat (limited to 'src/tracetab.cpp')
-rw-r--r--src/tracetab.cpp1073
1 files changed, 1073 insertions, 0 deletions
diff --git a/src/tracetab.cpp b/src/tracetab.cpp
new file mode 100644
index 0000000..5deb8ea
--- /dev/null
+++ b/src/tracetab.cpp
@@ -0,0 +1,1073 @@
+/* SPDX-License-Identifier: Unlicense
+ */
+
+#include "tracetab.h"
+#include "vec.h"
+#include "debug.h"
+
+#include <cassert>
+#include <cerrno>
+#include <cstdlib>
+#include <cstring>
+
+/// Releases the type, node and string arrays held by the table and resets
+/// the table to an empty state.
+TraceTable::~TraceTable(void)
+{
+    // free() is a no-op for null pointers, so no guards are required.
+    free(_shstr);
+    free(_nodes);
+    free(_types);
+    _shstr = nullptr;
+    _nodes = nullptr;
+    _types = nullptr;
+    _nodes_count = 0;
+    _types_count = 0;
+}
+
+/// Printable representation of every possible byte value, indexed by the
+/// byte converted to `unsigned char`. Printable ASCII maps to itself, except
+/// that `"`, `<`, `>` and `\` are prefixed with a backslash; tab, LF and CR
+/// use their usual escapes; everything else is spelled as `\xNN`.
+/// The table must hold exactly 256 entries: a missing trailing initializer
+/// would silently become a null pointer.
+const char *const g_escape_table[256] = {
+    "\\x00", "\\x01", "\\x02", "\\x03", "\\x04", "\\x05", "\\x06", "\\x07",
+    "\\x08", "\\t", "\\n", "\\x0b", "\\x0c", "\\r", "\\x0e", "\\x0f", "\\x10",
+    "\\x11", "\\x12", "\\x13", "\\x14", "\\x15", "\\x16", "\\x17", "\\x18",
+    "\\x19", "\\x1a", "\\x1b", "\\x1c", "\\x1d", "\\x1e", "\\x1f", " ", "!",
+    "\\\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0",
+    "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "\\<", "=", "\\>", "?",
+    "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
+    "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\\\",
+    "]", "^", "_", "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
+    "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
+    "{", "|", "}", "~", "\\x7f", "\\x80", "\\x81", "\\x82", "\\x83", "\\x84",
+    "\\x85", "\\x86", "\\x87", "\\x88", "\\x89", "\\x8a", "\\x8b", "\\x8c",
+    "\\x8d", "\\x8e", "\\x8f", "\\x90", "\\x91", "\\x92", "\\x93", "\\x94",
+    "\\x95", "\\x96", "\\x97", "\\x98", "\\x99", "\\x9a", "\\x9b", "\\x9c",
+    "\\x9d", "\\x9e", "\\x9f", "\\xa0", "\\xa1", "\\xa2", "\\xa3", "\\xa4",
+    "\\xa5", "\\xa6", "\\xa7", "\\xa8", "\\xa9", "\\xaa", "\\xab", "\\xac",
+    "\\xad", "\\xae", "\\xaf", "\\xb0", "\\xb1", "\\xb2", "\\xb3", "\\xb4",
+    "\\xb5", "\\xb6", "\\xb7", "\\xb8", "\\xb9", "\\xba", "\\xbb", "\\xbc",
+    "\\xbd", "\\xbe", "\\xbf", "\\xc0", "\\xc1", "\\xc2", "\\xc3", "\\xc4",
+    "\\xc5", "\\xc6", "\\xc7", "\\xc8", "\\xc9", "\\xca", "\\xcb", "\\xcc",
+    "\\xcd", "\\xce", "\\xcf", "\\xd0", "\\xd1", "\\xd2", "\\xd3", "\\xd4",
+    "\\xd5", "\\xd6", "\\xd7", "\\xd8", "\\xd9", "\\xda", "\\xdb", "\\xdc",
+    "\\xdd", "\\xde", "\\xdf", "\\xe0", "\\xe1", "\\xe2", "\\xe3", "\\xe4",
+    "\\xe5", "\\xe6", "\\xe7", "\\xe8", "\\xe9", "\\xea", "\\xeb", "\\xec",
+    "\\xed", "\\xee", "\\xef", "\\xf0", "\\xf1", "\\xf2", "\\xf3", "\\xf4",
+    "\\xf5", "\\xf6", "\\xf7", "\\xf8", "\\xf9", "\\xfa", "\\xfb", "\\xfc",
+    "\\xfd", "\\xfe", "\\xff",
+};
+
+/// Builds a heap-allocated string in which every one of the `len` bytes at
+/// `in` is replaced by its g_escape_table representation.
+/// Returns nullptr when the memory stream cannot be opened; otherwise the
+/// caller owns the result and releases it with free().
+static char *escapeStr(const char *in, size_t len)
+{
+    char *result = nullptr;
+    size_t result_size = 0;
+    FILE *stream = open_memstream(&result, &result_size);
+    if (stream) {
+        const char *const end = in + len;
+        for (const char *p = in; p != end; ++p) {
+            const char *repr = g_escape_table[static_cast<unsigned char>(*p)];
+            const int written = fwrite(repr, strlen(repr), 1, stream);
+            ASSERT(written == 1), (void)written;
+        }
+        // Closing the stream finalizes `result`.
+        fclose(stream);
+    }
+    return result;
+}
+
+enum class TokenKind { // Kinds of tokens; each comment below shows the matched pattern
+ /// Used to indicate end of data
+ kNone = 0,
+ /// Used to indicate unexpected end of data or char
+ kError,
+ /// "0x[0-9a-fA-F]+"
+ kNumHex,
+ /// "0|[1-9][0-9]*"
+ kNumDec,
+ /// "0[0-7]+"
+ kNumOct,
+ /// "0b[01]+"
+ kNumBin,
+ /// "[a-zA-Z_][0-9a-zA-Z_]*"
+ kAlphaNum,
+ /// ","
+ kComma,
+ /// "["
+ kLBracket,
+ /// "]"
+ kRBracket,
+ /// "#.*$"
+ kComment,
+ /// "\"
+ kBackslash,
+ /// "\n|\r|\r\n"
+ kNewLine,
+};
+
+/// A lexical token: its kind together with the offset (`pos`) and length
+/// (`len`) of the text it covers in the tokenizer input.
+struct Token {
+    using T = Token;
+    using K = TokenKind;
+    K kind{};
+    size_t pos{}, len{};
+    /// Name of the token kind as it is shown in diagnostics.
+    constexpr const char *Str() const {
+        switch(kind) {
+        case K::kNone:
+            return "<None>";
+        case K::kError:
+            return "<Error>";
+        case K::kNumHex:
+            return "hexadecimal numeric";
+        case K::kNumDec:
+            return "decimal numeric";
+        case K::kNumOct:
+            return "octal numeric";
+        case K::kNumBin:
+            return "binary numeric";
+        case K::kAlphaNum:
+            return "alphanumeric";
+        case K::kComma:
+            return "`,`";
+        case K::kLBracket:
+            return "`[`";
+        case K::kRBracket:
+            return "`]`";
+        case K::kComment:
+            return "comment";
+        case K::kBackslash:
+            return "backslash";
+        case K::kNewLine:
+            return "newline";
+        }
+        UNREACHABLE();
+        return "<undefined>";
+    }
+    /// True for any of the four numeric token kinds.
+    constexpr bool IsNum() const
+    {
+        switch (kind) {
+        case K::kNumHex:
+        case K::kNumDec:
+        case K::kNumOct:
+        case K::kNumBin:
+            return true;
+        default:
+            return false;
+        }
+    }
+    // Factories. Kinds with a fixed spelling take only a position.
+    static constexpr T None(size_t pos)
+    {
+        return T{ K::kNone, pos, 0};
+    }
+    static constexpr T NumHex(size_t pos, size_t len)
+    {
+        return T{ K::kNumHex, pos, len};
+    }
+    static constexpr T NumDec(size_t pos, size_t len)
+    {
+        return T{ K::kNumDec, pos, len};
+    }
+    static constexpr T NumOct(size_t pos, size_t len)
+    {
+        return T{ K::kNumOct, pos, len};
+    }
+    static constexpr T NumBin(size_t pos, size_t len)
+    {
+        return T{ K::kNumBin, pos, len};
+    }
+    static constexpr T AlphaNum(size_t pos, size_t len)
+    {
+        return T{ K::kAlphaNum, pos, len};
+    }
+    static constexpr T Comma(size_t pos)
+    {
+        return T{ K::kComma, pos, 1};
+    }
+    static constexpr T LBracket(size_t pos)
+    {
+        return T{ K::kLBracket, pos, 1};
+    }
+    static constexpr T RBracket(size_t pos)
+    {
+        return T{ K::kRBracket, pos, 1};
+    }
+    static constexpr T Comment(size_t pos, size_t len)
+    {
+        return T{ K::kComment, pos, len};
+    }
+    static constexpr T Backslash(size_t pos)
+    {
+        return T{ K::kBackslash, pos, 1};
+    }
+};
+
+/// Resolves the trace node type keyword spelled by the `len` bytes at `input`.
+///
+/// On success sets `k` (and `dt` for data types) and returns nullptr.
+/// On failure returns a description of the problem; the text lives in a
+/// function-local static buffer that is overwritten by the next failing call.
+static const char *ParseTypeFromStr(
+        const char *input, size_t len, TraceNodeKind &k, DataType &dt)
+{
+    struct {
+        const char *str;
+        DataTypeKind type;
+    } data[] {
+        { "blob", DataTypeKind::kBlob, },
+        { "str", DataTypeKind::kStr, },
+        { "strz", DataTypeKind::kStrz, },
+    };
+    const char *intable[] { "ptr", "u8", "u16", "u32", };
+    static char err[256];
+    // The token is not NUL-terminated, so a keyword matches only when it has
+    // exactly the token's length and the same bytes. A strncmp() limited to
+    // `len` would also accept any prefix of a keyword (e.g. `f` as `fn`).
+    const auto is_keyword = [input, len](const char *keyword) {
+        return strlen(keyword) == len && 0 == memcmp(keyword, input, len);
+    };
+    TRACE("Token: \"%.*s\"", static_cast<int>(len), input);
+    if (is_keyword("fn")) {
+        k = TraceNodeKind::kFunction;
+        return nullptr;
+    } else if (is_keyword("pc")) {
+        k = TraceNodeKind::kPc;
+        return nullptr;
+    }
+    for (size_t i = 0; i < (sizeof data) / (sizeof *data); i++) {
+        if (is_keyword(data[i].str)) {
+            k = TraceNodeKind::kData;
+            dt = DataType{ data[i].type };
+            return nullptr;
+        }
+    }
+    // Integer types are recognized only to give a more helpful message.
+    for (size_t i = 0; i < (sizeof intable) / (sizeof *intable); i++) {
+        const char *t = intable[i];
+        if (is_keyword(t)) {
+            snprintf(
+                    err, sizeof err,
+                    "`%s` trace node type only allowed in a table, "
+                    "use `[%s]` syntax instead", t, t);
+            return err;
+        }
+    }
+    snprintf(err, sizeof err, "unknown trace node type");
+    return err;
+}
+
+/// Token flavour of ParseTypeFromStr(): parses the text that token `t` covers
+/// in `input` and forwards the result.
+static const char *ParseTypeFromToken(
+        const Token &t, const char *input, TraceNodeKind &k, DataType &dt)
+{
+    const char *const token_begin = input + t.pos;
+    const size_t token_length = t.len;
+    return ParseTypeFromStr(token_begin, token_length, k, dt);
+}
+
+/// Resolves a table column type keyword (`ptr`, `u32`, `u16` or `u8`) spelled
+/// by the `len` bytes at `input`.
+///
+/// On success stores the type with a count of 1 in `dt` and returns nullptr.
+/// On failure returns a description of the problem; the text lives in a
+/// function-local static buffer.
+static const char *ParseTableTypeFromStr(
+        const char *input, size_t len, DataType &dt)
+{
+    struct {
+        const char *str;
+        DataTypeKind type;
+    } data[] {
+        { "ptr", DataTypeKind::kPtr, },
+        { "u32", DataTypeKind::kU32, },
+        { "u16", DataTypeKind::kU16, },
+        { "u8", DataTypeKind::kU8, },
+    };
+    static char err[256];
+    TRACE("Token: \"%.*s\"", static_cast<int>(len), input);
+    for (size_t i = 0; i < (sizeof data) / (sizeof *data); i++) {
+        // Require an exact match: the token is not NUL-terminated and a
+        // strncmp() limited to `len` would accept prefixes such as `u`.
+        const char *const keyword = data[i].str;
+        if (strlen(keyword) == len && 0 == memcmp(keyword, input, len)) {
+            dt = DataType{ data[i].type, 1 };
+            return nullptr;
+        }
+    }
+    snprintf(err, sizeof err, "unknown table type");
+    return err;
+}
+
+/// Token flavour of ParseTableTypeFromStr(): parses the text that token `t`
+/// covers in `input` and forwards the result.
+static const char *ParseTableTypeFromToken(
+        const Token &t, const char *input, DataType &dt)
+{
+    const char *const token_begin = input + t.pos;
+    const size_t token_length = t.len;
+    return ParseTableTypeFromStr(token_begin, token_length, dt);
+}
+
+enum class TokenizerState { // States of the character-level machine in Tokenizer::handleChar
+ kFree = 0, // Between tokens: the next character starts a new token
+ kNumAmbiguous, // A leading `0` was read and the radix is not known yet
+ kNumHexStillNoDigits, // `0x` was read, a hexadecimal digit must follow
+ kNumHex, // Inside the digits of a hexadecimal number
+ kNumDec, // Inside the digits of a decimal number
+ kNumOct, // Inside the digits of an octal number
+ kNumBinStillNoDigits, // `0b` was read, a binary digit must follow
+ kNumBin, // Inside the digits of a binary number
+ kAlphaNum, // Inside an alphanumeric token
+ kComment, // Inside a `#` comment, up to the end of the line
+ kNewLine, // CR was read and the LF of a CRLF pair is next
+ kError, // Set by Tokenizer::errorExpect after an unexpected character
+};
+
+struct LinePosInfo { // Location of an input offset, as computed by Tokenizer::GetLinePosInfo
+ size_t lineno; // Zero-based line number: count of line breaks before the offset
+ size_t col; // Zero-based column: characters between the line start and the offset
+ size_t pos; // Offset in the input at which that line starts
+};
+
+/// Cuts the input buffer into tokens, one per Next() call, and prints
+/// diagnostics that quote the offending source line.
+class Tokenizer {
+    FILE *_errstream{};
+    const char *_filename{};
+    size_t _pos{}, _token_pos{};
+    TokenizerState _state{};
+    TokenKind handleChar(char c, bool is_last);
+    TokenKind errorExpect(void);
+    /// Character right after the current one; callers check for end of input.
+    char lookAhead() const { return in[_pos + 1]; }
+    void printError(const LinePosInfo l, size_t underline_len, const char *str);
+public:
+    const char *in{};
+    size_t in_size{};
+    Tokenizer(){}
+    Tokenizer(const void *input, size_t size, FILE *errstream, const char *filename)
+        : _errstream(errstream)
+        , _filename(filename)
+        , in(static_cast<const char *>(input))
+        , in_size(size)
+    {}
+    /// Returns the next token, or a kNone token once the input is exhausted.
+    Token Next()
+    {
+        for (; _pos < in_size;) {
+            const bool last = (_pos + 1 >= in_size);
+            const TokenKind kind = handleChar(in[_pos], last);
+            ++_pos;
+            if (TokenKind::kNone == kind) {
+                continue;
+            }
+            return Token{ kind, _token_pos, _pos - _token_pos };
+        }
+        return Token::None(_pos);
+    }
+    /// Walks the input up to `pos` and reports the line number, the column and
+    /// the offset of the line start for that position. CR, LF and CRLF each
+    /// count as a single line break.
+    LinePosInfo GetLinePosInfo(const size_t pos)
+    {
+        LinePosInfo info{};
+        bool after_cr = false;
+        for (size_t offset = 0; offset < pos; offset++) {
+            switch (in[offset]) {
+            case '\r':
+                after_cr = true;
+                info.pos = offset + 1;
+                info.lineno++;
+                info.col = 0;
+                break;
+            case '\n':
+                // LF right after CR belongs to the break already counted.
+                if (!after_cr) {
+                    info.lineno++;
+                }
+                after_cr = false;
+                info.pos = offset + 1;
+                info.col = 0;
+                break;
+            default:
+                after_cr = false;
+                info.col++;
+                break;
+            }
+        }
+        return info;
+    }
+    /// Reports `errstr` at token `t`, underlining the whole token.
+    void PrintError(const Token& t, const char *errstr)
+    {
+        const LinePosInfo where = GetLinePosInfo(t.pos);
+        printError(where, t.len, errstr);
+    }
+    /// Reports `errstr` at the current position, underlining one character.
+    void PrintError(const char *errstr)
+    {
+        const LinePosInfo where = GetLinePosInfo(_pos);
+        printError(where, 1, errstr);
+    }
+};
+
+/// Describes, for use in diagnostics, which characters the tokenizer accepts
+/// next while it is in state `s`. The comment, newline and error states are
+/// not expected to be passed here.
+const char *TokenizerStateToExpectedText(TokenizerState s)
+{
+    using S = TokenizerState;
+    switch (s) {
+    // States that accept a fresh token or a continuation of the current one
+    case S::kFree:
+        return "digit, alphabetic, `_`, `,`, `[`, `]`, `#`, space, tab, CR or LF";
+    case S::kAlphaNum:
+        return "decimal digit, alphabetic, `_`, `,`, `[`, `]`, `#`, space, tab, CR or LF";
+    case S::kNumDec:
+        return "decimal digit, `,`, `[`, `]`, `#`, space, tab, CR or LF";
+    case S::kNumHex:
+        return "hexadecimal digit, `,`, `[`, `]`, `#`, space, tab, CR or LF";
+    case S::kNumOct:
+        return "octal digit, `,`, `[`, `]`, `#`, space, tab, CR or LF";
+    case S::kNumBin:
+        return "binary digit, `,`, `[`, `]`, `#`, space, tab, CR or LF";
+    // States in which a number prefix still lacks its mandatory part
+    case S::kNumAmbiguous:
+        return "`x`, `b` or octal digit";
+    case S::kNumHexStillNoDigits:
+        return "hexadecimal digit";
+    case S::kNumBinStillNoDigits:
+        return "binary digit";
+    // States that never produce an "expected ..." diagnostic
+    case S::kComment:
+        UNREACHABLE();
+        return "<Comment>";
+    case S::kNewLine:
+        UNREACHABLE();
+        return "<NewLine>";
+    case S::kError:
+        UNREACHABLE();
+        return "<Error>";
+    }
+    UNREACHABLE();
+    return "<undefined>";
+}
+
+/// Returns the number of characters at the start of the NUL-terminated `str`
+/// that precede the first LF, CR or the terminating NUL.
+static size_t FindLineLength(const char *const str)
+{
+    // strcspn() stops at the first character of the reject set or at NUL.
+    return strcspn(str, "\n\r");
+}
+
+/// Prints a diagnostic to the error stream in three lines:
+///   `<file>:<line>:<col>: error: <err>`, the offending source line with a
+///   line-number gutter, and a `^~~~` marker placed under the reported span.
+///
+/// @param l              location of the span, see Tokenizer::GetLinePosInfo
+/// @param underline_len  width of the marker, including the leading `^`
+/// @param err            message text, without a trailing newline
+void Tokenizer::printError(const LinePosInfo l, size_t underline_len, const char *err)
+{
+    const char *name = _filename ? _filename : "<stdin>";
+    // LinePosInfo::lineno is zero-based. Show the same one-based number in
+    // the header and in the gutter.
+    const size_t lineno = l.lineno + 1;
+    fprintf(_errstream, "%s:%zu:%zu: error: %s\n", name, lineno, l.col, err);
+    const char *const line = in + l.pos;
+    // Measure the line without leaving the buffer: the input is passed with
+    // an explicit size and does not have to be NUL-terminated.
+    size_t line_length = 0;
+    while (l.pos + line_length < in_size) {
+        const char c = line[line_length];
+        if (c == '\n' || c == '\r' || c == '\000') {
+            break;
+        }
+        line_length++;
+    }
+    fprintf(_errstream, "%5zu | %.*s\n", lineno, static_cast<int>(line_length), line);
+    // Same width as the "%5zu | " gutter, so the marker lines up with the text.
+    fputs("      | ", _errstream);
+    for (size_t i = 0; i < l.col; i++) {
+        // Repeat tabs of the source line to keep the alignment in terminals.
+        if (in[l.pos + i] == '\t') {
+            fputc('\t', _errstream);
+        } else {
+            fputc(' ', _errstream);
+        }
+    }
+    fputc('^', _errstream);
+    for (size_t i = 1; i < underline_len; i++) {
+        fputc('~', _errstream);
+    }
+    fputc('\n', _errstream);
+}
+
+/// Reports the character at the current position as unexpected, listing what
+/// the state machine would have accepted instead, and puts the tokenizer into
+/// the error state.
+///
+/// @return always TokenKind::kError
+TokenKind Tokenizer::errorExpect(void)
+{
+    // Remember what was being parsed before the state is replaced, otherwise
+    // the "expected" text would be looked up for the error state itself.
+    const TokenizerState failed_state = _state;
+    _state = TokenizerState::kError;
+    size_t size{};
+    char *errstr{};
+    FILE *memstream = open_memstream(&errstr, &size);
+    if (nullptr == memstream) {
+        return TokenKind::kError;
+    }
+    const char *escaped = g_escape_table[static_cast<unsigned char>(in[_pos])];
+    // No trailing newline here: the error printer terminates the line itself.
+    fprintf(
+            memstream,
+            "unexpected char `%s`, expected %s",
+            escaped ? escaped : "?",
+            TokenizerStateToExpectedText(failed_state));
+    fclose(memstream);
+    PrintError(errstr);
+    free(errstr);
+    return TokenKind::kError;
+}
+
+/// True for the in-line blanks: space and horizontal tab.
+static constexpr bool IsSpace(char c)
+{
+    switch (c) {
+    case ' ':
+    case '\t':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/// True for the line break characters: LF and CR.
+static constexpr bool IsNewLine(char c)
+{
+    switch (c) {
+    case '\n':
+    case '\r':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/// True for the characters that may directly follow a number or an
+/// alphanumeric token: punctuation of the format, blanks and line breaks.
+static constexpr bool IsValidSequenceBreaker(char c)
+{
+    switch (c) {
+    case ',':
+    case '[':
+    case ']':
+    case '#':
+    case '\\':
+        return true;
+    default:
+        return IsNewLine(c) || IsSpace(c);
+    }
+}
+
+/// True for ASCII letters and the underscore.
+static constexpr bool IsAlphabetic(char c)
+{
+    if (c == '_') {
+        return true;
+    }
+    if ('a' <= c && c <= 'z') {
+        return true;
+    }
+    return 'A' <= c && c <= 'Z';
+}
+
+/// True for ASCII letters, the underscore and decimal digits.
+static constexpr bool IsAlphaNumeric(char c)
+{
+    if ('0' <= c && c <= '9') {
+        return true;
+    }
+    return IsAlphabetic(c);
+}
+
+/// True for the decimal digits `0`..`9`.
+static constexpr bool IsDecimal(char c)
+{
+    return !(c < '0' || c > '9');
+}
+
+/// True for decimal digits and the letters `a`..`f` in either case.
+static constexpr bool IsHexadecimal(char c)
+{
+    if (IsDecimal(c)) {
+        return true;
+    }
+    if ('a' <= c && c <= 'f') {
+        return true;
+    }
+    return 'A' <= c && c <= 'F';
+}
+
+/// True for the octal digits `0`..`7`.
+static constexpr bool IsOctal(char c)
+{
+    return !(c < '0' || c > '7');
+}
+
+/// True for the binary digits `0` and `1`.
+static constexpr bool IsBinary(char c)
+{
+    switch (c) {
+    case '0':
+    case '1':
+        return true;
+    default:
+        return false;
+    }
+}
+
+TokenKind Tokenizer::handleChar(const char c, const bool is_last) // Feeds the character at _pos into the state machine
+{ // Returns the kind of a token completed by `c`, or kNone while no token is complete yet
+ switch (_state) {
+ case TokenizerState::kFree: // Between tokens: `c` starts a new token or is a blank
+ TRACE("kFree %zu", _pos);
+ _token_pos = _pos; // Blanks move the token start along with the position
+ if (c == '\n') {
+ return TokenKind::kNewLine;
+ } else if (c == '\r') {
+ if (is_last || '\n' != lookAhead()) { // A lone CR is a complete line break
+ return TokenKind::kNewLine;
+ }
+ _state = TokenizerState::kNewLine; // CRLF: the token is completed by the LF
+ return TokenKind::kNone;
+ } else if (c == ',') {
+ return TokenKind::kComma;
+ } else if (c == '[') {
+ return TokenKind::kLBracket;
+ } else if (c == ']') {
+ return TokenKind::kRBracket;
+ } else if (c == '#') {
+ if (is_last || IsNewLine(lookAhead())) { // Comment consisting of the `#` alone
+ return TokenKind::kComment;
+ }
+ _state = TokenizerState::kComment;
+ return TokenKind::kNone;
+ } else if (c == '\\') {
+ return TokenKind::kBackslash;
+ } else if (c == '0') {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) { // A single `0` is a decimal number
+ return TokenKind::kNumDec;
+ }
+ _state = TokenizerState::kNumAmbiguous; // Radix is decided by the next character
+ return TokenKind::kNone;
+ } else if (IsDecimal(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ return TokenKind::kNumDec;
+ }
+ _state = TokenizerState::kNumDec;
+ return TokenKind::kNone;
+ } else if (IsAlphaNumeric(c)) { // Digits were handled above, so this is a letter or `_`
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ return TokenKind::kAlphaNum;
+ }
+ _state = TokenizerState::kAlphaNum;
+ return TokenKind::kNone;
+ } else if (IsSpace(c)) {
+ // Skip
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumAmbiguous: // After a leading `0`: expects `b`, `x` or an octal digit
+ TRACE("kNumAmbiguous %zu", _pos);
+ ASSERT(!IsValidSequenceBreaker(c)); // The previous step looked ahead and would have finished the token
+ if (c == 'b' || c == 'B') {
+ _state = TokenizerState::kNumBinStillNoDigits;
+ return TokenKind::kNone;
+ } else if (c == 'x' || c == 'X') {
+ _state = TokenizerState::kNumHexStillNoDigits;
+ return TokenKind::kNone;
+ } else if (IsOctal(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumOct;
+ }
+ _state = TokenizerState::kNumOct;
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumHexStillNoDigits: // After `0x`: the first hexadecimal digit is mandatory
+ TRACE("kNumHexStillNoDigits %zu", _pos);
+ if (IsHexadecimal(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumHex;
+ }
+ _state = TokenizerState::kNumHex;
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumHex: // Further hexadecimal digits
+ TRACE("kNumHex %zu", _pos);
+ ASSERT(!IsValidSequenceBreaker(c));
+ if (IsHexadecimal(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumHex;
+ }
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumDec: // Further decimal digits
+ TRACE("kNumDec %zu", _pos);
+ ASSERT(!IsValidSequenceBreaker(c));
+ if (IsDecimal(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumDec;
+ }
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumOct: // Further octal digits
+ TRACE("kNumOct %zu", _pos);
+ ASSERT(!IsValidSequenceBreaker(c));
+ if (IsOctal(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumOct;
+ }
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumBinStillNoDigits: // After `0b`: the first binary digit is mandatory
+ TRACE("kNumBinStillNoDigits %zu", _pos);
+ if (IsBinary(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumBin;
+ }
+ _state = TokenizerState::kNumBin;
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kNumBin: // Further binary digits
+ TRACE("kNumBin %zu", _pos);
+ ASSERT(!IsValidSequenceBreaker(c));
+ if (IsBinary(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kNumBin;
+ }
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kAlphaNum: // Further characters of an alphanumeric token
+ TRACE("kAlphaNum %zu", _pos);
+ ASSERT(!IsValidSequenceBreaker(c));
+ if (IsAlphaNumeric(c)) {
+ if (is_last || IsValidSequenceBreaker(lookAhead())) {
+ _state = TokenizerState::kFree;
+ return TokenKind::kAlphaNum;
+ }
+ return TokenKind::kNone;
+ }
+ return errorExpect();
+ case TokenizerState::kComment: // Any character is accepted until the line ends
+ TRACE("kComment %zu", _pos);
+ if (is_last || IsNewLine(lookAhead())) { // The line break itself is not part of the comment token
+ _state = TokenizerState::kFree;
+ return TokenKind::kComment;
+ }
+ return TokenKind::kNone;
+ case TokenizerState::kNewLine: // Second character of a CRLF pair
+ TRACE("kNewLine %zu", _pos);
+ ASSERT(c == '\n'); // Entered only after the look-ahead has seen LF
+ _state = TokenizerState::kFree;
+ return TokenKind::kNewLine;
+ case TokenizerState::kError: // Not expected to be fed after an error was reported
+ TRACE("kError %zu", _pos);
+ UNREACHABLE();
+ return TokenKind::kNone;
+ }
+ UNREACHABLE();
+ return TokenKind::kNone;
+}
+
+enum class ParserState { // What Parser::handleToken expects next within a statement
+ kAddress = 0, // Start of a statement: heading comment, blank line or the address number
+ kType, // After the address: type keyword or `[` opening a table
+ kTableType, // Inside `[...]`: a table type keyword, or `]` once the table is not empty
+ kTableTypeCountOrSep, // After a table type: its count, `,` or `]`
+ kTableSep, // After a table type count: `,` or `]`
+ kCountOrName, // After the type: count number, name or end of statement
+ kName, // After the count: name or end of statement
+ kEscapeEndOfLine, // After a backslash: only a comment or a line break may follow
+ kEndOfLine, // After the name: only a side comment or end of statement
+ kError, // Error recovery: tokens are skipped until the statement ends
+};
+
+enum class ParseResult { // Outcome of handling one token and of Parser::Parse as a whole
+ kOk = 0, // Token handled, parsing continues with the next token
+ kFinished, // End of input was reached
+ kError, // Returned by Parser::Parse when an error has been recorded
+};
+
+struct Statement { // Fields of the trace node statement that is currently being parsed
+ uint32_t address_value{}; // Value of the leading numeric token
+ bool has_comment{}; // A heading comment has been started for this statement
+ TraceNodeKind trace_node_kind{}; // Node kind selected by the type keyword or by `[`
+ DataType data_type{}; // Data type description, filled for data nodes
+ uint32_t size_value{}; // Size computed from the count, or the count itself for non-data nodes
+ size_t name_index{}; // Offset of the name in the shared string buffer, 0 when there is none
+ size_t comment_index{}; // Offset of the heading comment in the shared string buffer, 0 when there is none
+};
+
+/// Statement-level parser of the trace table text.
+///
+/// Pulls tokens from `tokenizer` and collects the results in `nodes`, `types`
+/// and in the shared string buffer `shstr`, which holds names and heading
+/// comments as NUL-terminated strings addressed by their offsets. Offset 0
+/// always holds an empty string. After Parse() returns, `shstr` is a malloc'ed
+/// buffer that is not released by this class.
+class Parser {
+    Statement _stmt;
+    FILE *_shstr{}; // Memory stream that writes into shstr / shstr_size
+    ParserState _state{};
+    ParserState _saved_state{}; // For kEscapeEndOfLine
+    TokenKind _prev_token_kind{};
+    bool _error{};
+    ParseResult handleToken(const Token &);
+    ParseResult errorExpect(const Token &t, const char *expected);
+    ParseResult error(const Token &t, const char *text);
+    /// Appends `len` bytes of `str` as a new NUL-terminated string and returns
+    /// its offset. With `len == 0` only seals whatever has been written so
+    /// far. Returns 0, the offset of the empty string, if there is no stream.
+    size_t addToShstr(const char *str, const size_t len)
+    {
+        if (nullptr == _shstr) {
+            return 0;
+        }
+        // Flushing updates shstr and shstr_size to the current contents.
+        fflush(_shstr);
+        ASSERT(shstr_size);
+        const size_t index = shstr_size;
+        if (len) {
+            ASSERT(shstr[shstr_size - 1] == '\000'); // Make sure it is sealed
+            const int res = fwrite(str, len, 1, _shstr);
+            ASSERT(res == 1), (void)res;
+        }
+        fflush(_shstr);
+        ASSERT(shstr_size);
+        if (shstr[shstr_size - 1] != '\000') {
+            const int res = fwrite("\000", 1, 1, _shstr);
+            ASSERT(res == 1), (void) res;
+        }
+        return index;
+    }
+    /// Appends `len` bytes of `str` without terminating them and returns the
+    /// offset at which they start. Returns 0 if there is no stream.
+    size_t addToShstrRaw(const char *str, const size_t len)
+    {
+        if (nullptr == _shstr) {
+            return 0;
+        }
+        fflush(_shstr);
+        ASSERT(shstr_size);
+        const size_t index = shstr_size;
+        if (len) {
+            const int res = fwrite(str, len, 1, _shstr);
+            ASSERT(res == 1), (void) res;
+        }
+        return index;
+    }
+    /// Stores the text of the name token `t` and returns its offset.
+    size_t addNameTokenToShstr(const Token &t)
+    {
+        const char *str = tokenizer.in + t.pos;
+        const size_t len = t.len;
+        ASSERT(len);
+        if (__TRACE) {
+            char *estr = escapeStr(tokenizer.in + t.pos, t.len);
+            if (estr) {
+                TRACE("Name token: \"%s\"", estr);
+                free(estr);
+            }
+        }
+        return addToShstr(str, len);
+    }
+    /// Stores the text of the comment token `t` without its leading `#` and
+    /// returns the offset at which it starts. The comment is left unsealed so
+    /// the following comment lines can be appended to it.
+    size_t addComment(const Token &t)
+    {
+        // The token always contains the `#`, but the text after it may be
+        // empty: a bare `#` line is a valid comment.
+        ASSERT(t.len);
+        const char *str = tokenizer.in + t.pos + 1;
+        const size_t len = t.len - 1;
+        if (__TRACE) {
+            char *estr = escapeStr(tokenizer.in + t.pos, t.len);
+            if (estr) {
+                TRACE("Comment token: \"%s\"", estr);
+                free(estr);
+            }
+        }
+        return addToShstrRaw(str, len);
+    }
+    void appendCommentNewline(void)
+    {
+        // Newlines are simulated instead of using original ones, because it is
+        // easier to split multiline comment when printing if newlines used are
+        // uniform (only "\n"), when original ones may be mixed, i.e. "\r\n",
+        // "\n" and even "\r" inside a single multiline comment.
+        addToShstrRaw("\n", (sizeof "\n") - 1);
+    }
+    /// Terminates the comment accumulated so far, if it is not terminated yet.
+    void sealComment(void) { addToShstr(nullptr, 0); }
+    /// Appends a trace node built from `stmt` and resets `stmt`.
+    void addTraceNode(Statement &&stmt)
+    {
+        nodes.PushBack(TraceNode{
+            stmt.trace_node_kind,
+            stmt.data_type,
+            stmt.size_value,
+            stmt.address_value,
+            stmt.name_index,
+            stmt.comment_index,
+        });
+        stmt = Statement();
+    }
+public:
+    Tokenizer tokenizer{};
+    Vec<DataType> types;
+    Vec<TraceNode> nodes;
+    char *shstr{};
+    size_t shstr_size{};
+    Parser(const void *input, size_t size, FILE *errstream, const char *filename)
+        : tokenizer(input, size, errstream, filename)
+    {
+        // The stream is opened here rather than in the initializer list:
+        // `shstr` and `shstr_size` are declared after `_shstr`, so they would
+        // be initialized only after the stream has been given their addresses.
+        _shstr = open_memstream(&shstr, &shstr_size);
+        if (nullptr == _shstr) {
+            return;
+        }
+        // Write a single zero so any index 0 name or comment will point to it.
+        const int res = fwrite("\000", 1, 1, _shstr);
+        ASSERT(res == 1), (void) res;
+    }
+    // The stream is closed by the destructor, so copies must not exist.
+    Parser(const Parser &) = delete;
+    Parser &operator=(const Parser &) = delete;
+    ~Parser()
+    {
+        if (_shstr) {
+            fclose(_shstr);
+            _shstr = nullptr;
+        }
+    }
+    /// Consumes the whole input. Returns kFinished when the end of input was
+    /// reached without errors and kError otherwise. Closes the string stream,
+    /// which finalizes `shstr` and `shstr_size`.
+    ParseResult Parse(void)
+    {
+        ParseResult res{};
+        do {
+            const auto token = tokenizer.Next();
+            TRACE("%s", token.Str());
+            res = handleToken(token);
+            _prev_token_kind = token.kind;
+        } while (ParseResult::kOk == res);
+        if (_shstr) {
+            fclose(_shstr);
+            _shstr = nullptr;
+        }
+        return _error ? ParseResult::kError : res;
+    }
+};
+
+/// Records a parse error for token `t`, prints "unexpected <token>, expected
+/// <expected>" at the token and switches the parser to error recovery.
+/// Parsing goes on, hence the result is always kOk.
+ParseResult Parser::errorExpect(const Token &t, const char *expected)
+{
+    _error = true;
+    _state = ParserState::kError;
+    char *message = nullptr;
+    size_t message_size = 0;
+    if (FILE *stream = open_memstream(&message, &message_size)) {
+        fprintf(stream, "unexpected %s, expected %s", t.Str(), expected);
+        // The message buffer is complete only after the stream is closed.
+        fclose(stream);
+        tokenizer.PrintError(t, message);
+        free(message);
+    }
+    return ParseResult::kOk;
+}
+
+/// Records a parse error for token `t`, prints `errstr` at the token and
+/// switches the parser to error recovery. Parsing goes on, hence the result
+/// is always kOk.
+ParseResult Parser::error(const Token &t, const char *errstr)
+{
+    _state = ParserState::kError;
+    // Without the flag Parse() would report success for input that has
+    // produced an error message.
+    _error = true;
+    tokenizer.PrintError(t, errstr);
+    return ParseResult::kOk;
+}
+
+/// Converts a numeric token to a 32-bit value.
+///
+/// Accepts what the tokenizer produces: decimal, `0x` hexadecimal, `0` octal
+/// and `0b` binary literals. The text must be followed by a character that
+/// cannot continue the number, because strtoul() finds the end on its own.
+///
+/// @param str    start of the token
+/// @param len    length of the token
+/// @param value  receives the result; left untouched on failure
+/// @return false if the token is empty or the number does not fit in 32 bits
+static bool ParseNumeric(const char *str, size_t len, uint32_t &value)
+{
+    // No need to thoroughly validate the number here, because we trust and
+    // heavily rely on the tokenizer getting it right.
+    if (len < 1) {
+        return false;
+    }
+    // strtoul() does not know the `0b` prefix, so it is skipped by hand.
+    const bool is_binary =
+        len > 2 && str[0] == '0' && (str[1] == 'b' || str[1] == 'B');
+    errno = 0;
+    const unsigned long parsed = is_binary
+        ? strtoul(str + 2, nullptr, 2)
+        : strtoul(str, nullptr, 0);
+    if (errno) {
+        // On overflow strtoul() returns ULONG_MAX and sets ERANGE.
+        return false;
+    }
+    // `unsigned long` may be wider than 32 bits: reject what would be
+    // truncated by the conversion.
+    const uint32_t narrowed = static_cast<uint32_t>(parsed);
+    if (static_cast<unsigned long>(narrowed) != parsed) {
+        return false;
+    }
+    value = narrowed;
+    return true;
+}
+
+/// True for the token kinds that terminate a statement: a comment, a line
+/// break or the end of input.
+static bool IsEndOfStatement(const TokenKind k)
+{
+    switch (k) {
+    case TokenKind::kComment:
+    case TokenKind::kNewLine:
+    case TokenKind::kNone:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/// Advances the statement parser by one token.
+///
+/// A statement has the form `<address> [<type> [<count>] [<name>]]`, may be
+/// preceded by heading comment lines and is terminated by a comment, a line
+/// break or the end of input.
+///
+/// @return kOk to continue with the next token, kFinished at the end of
+///         input, kError when the tokenizer has failed and parsing stops
+ParseResult Parser::handleToken(const Token &t)
+{
+    if (t.kind == TokenKind::kError) {
+        // The tokenizer has already reported the problem and stays in its
+        // error state, so it must not be asked for further tokens.
+        _error = true;
+        _state = ParserState::kError;
+        return ParseResult::kError;
+    }
+    // The trace node description line may be broken in multiple lines at any
+    // point using the backslash '\' symbol.
+    if (t.kind == TokenKind::kBackslash && _state != ParserState::kEscapeEndOfLine) {
+        _saved_state = _state;
+        _state = ParserState::kEscapeEndOfLine;
+        return ParseResult::kOk;
+    }
+    switch (_state) {
+    case ParserState::kAddress:
+        if (t.kind == TokenKind::kComment) {
+            ASSERT(t.len);
+            if (_stmt.has_comment) {
+                // Continuation line of a multiline heading comment
+                appendCommentNewline();
+                addComment(t);
+            } else {
+                _stmt.comment_index = addComment(t);
+                _stmt.has_comment = true;
+            }
+            return ParseResult::kOk;
+        } else if (t.IsNum()) {
+            if (false == ParseNumeric(tokenizer.in + t.pos, t.len, _stmt.address_value)) {
+                return error(t, "number is too big");
+            }
+            sealComment();
+            _state = ParserState::kType;
+            return ParseResult::kOk;
+        } else if (IsEndOfStatement(t.kind)) {
+            ASSERT(t.kind != TokenKind::kComment);
+            if (t.kind == TokenKind::kNone) {
+                sealComment();
+            }
+            if (t.kind == TokenKind::kNewLine && _prev_token_kind == TokenKind::kNewLine) {
+                // An empty line separating comments means that a comment above it
+                // does not belong to any trace node statement, hence we should
+                // reset the heading comment parsing state and seal what has
+                // been parsed so far.
+                _stmt.has_comment = false;
+                // It is easier to just seal the string than undo what has been
+                // written to it so far.
+                sealComment();
+            }
+            return (t.kind == TokenKind::kNone)
+                ? ParseResult::kFinished : ParseResult::kOk;
+        }
+        return errorExpect(t, "any numeric token");
+    case ParserState::kType:
+        if (t.kind == TokenKind::kAlphaNum) {
+            const char *err = ParseTypeFromToken(
+                    t, tokenizer.in, _stmt.trace_node_kind, _stmt.data_type);
+            if (err) {
+                return error(t, err);
+            }
+            _state = ParserState::kCountOrName;
+            return ParseResult::kOk;
+        } else if (t.kind == TokenKind::kLBracket) {
+            // The types of the table are appended to `types` starting from
+            // the current end of the vector.
+            _stmt.trace_node_kind = TraceNodeKind::kData;
+            _stmt.data_type = DataType{ DataTypeKind::kTable, 0, types.Size() };
+            _state = ParserState::kTableType;
+            return ParseResult::kOk;
+        } else if (IsEndOfStatement(t.kind)) {
+            addTraceNode(static_cast<Statement &&>(_stmt));
+            _state = ParserState::kAddress;
+            return (t.kind == TokenKind::kNone)
+                ? ParseResult::kFinished : ParseResult::kOk;
+        }
+        return errorExpect(t, "alphanumeric token");
+    case ParserState::kTableType:
+        if (t.kind == TokenKind::kAlphaNum) {
+            DataType data_type{};
+            const char *err = ParseTableTypeFromToken(t, tokenizer.in, data_type);
+            if (err) {
+                return error(t, err);
+            }
+            types.PushBack(data_type);
+            _stmt.data_type.nested_num++;
+            _state = ParserState::kTableTypeCountOrSep;
+            return ParseResult::kOk;
+        } else if (_stmt.data_type.nested_num) {
+            // Allow closing bracket after comma, but only when the table
+            // already contains at least a single type, i.e. empty table is not
+            // allowed.
+            if (t.kind == TokenKind::kRBracket) {
+                _state = ParserState::kCountOrName;
+                return ParseResult::kOk;
+            }
+            return errorExpect(t, "`]` or table type alphanumeric token, one of: `ptr`, `u32`, `u16` or `u8`");
+        }
+        return errorExpect(t, "table type alphanumeric token, one of: `ptr`, `u32`, `u16` or `u8`");
+    case ParserState::kTableTypeCountOrSep:
+        if (t.IsNum()) {
+            uint32_t value{};
+            if (false == ParseNumeric(tokenizer.in + t.pos, t.len, value)) {
+                return error(t, "number is too big");
+            }
+            // The count applies to the table type that was added last.
+            ASSERT(types.Size());
+            types[types.Size() - 1].count = value;
+            _state = ParserState::kTableSep;
+            return ParseResult::kOk;
+        } else if (t.kind == TokenKind::kComma) {
+            _state = ParserState::kTableType;
+            return ParseResult::kOk;
+        } else if (_stmt.data_type.nested_num) {
+            // Allow closing bracket after comma, but only when the table
+            // already contains at least a single type, i.e. empty table is not
+            // allowed.
+            if (t.kind == TokenKind::kRBracket) {
+                _state = ParserState::kCountOrName;
+                return ParseResult::kOk;
+            }
+            return errorExpect(t, "`]`, `,`, number or table type alphanumeric token, one of: `ptr`, `u32`, `u16` or `u8`");
+        }
+        return errorExpect(t, "`,`, number or table type alphanumeric token, one of: `ptr`, `u32`, `u16` or `u8`");
+    case ParserState::kTableSep:
+        if (t.kind == TokenKind::kComma) {
+            _state = ParserState::kTableType;
+            return ParseResult::kOk;
+        } else if (t.kind == TokenKind::kRBracket) {
+            _state = ParserState::kCountOrName;
+            return ParseResult::kOk;
+        }
+        return errorExpect(t, "`,` or `]`");
+    case ParserState::kCountOrName:
+        if (t.IsNum()) {
+            uint32_t value{};
+            if (false == ParseNumeric(tokenizer.in + t.pos, t.len, value)) {
+                return error(t, "number is too big");
+            }
+            if (_stmt.trace_node_kind == TraceNodeKind::kData) {
+                _stmt.data_type.count = value;
+                if (_stmt.data_type.kind == DataTypeKind::kTable) {
+                    // Size of a table is the number of entries multiplied
+                    // by the summed size of all the types of an entry.
+                    uint32_t table_entry_size = 0;
+                    for (size_t i = 0; i < _stmt.data_type.nested_num; i++) {
+                        const auto &type = types[i + _stmt.data_type.nested_idx];
+                        ASSERT(type.count);
+                        table_entry_size += type.count * type.BaseSize();
+                    }
+                    _stmt.size_value = value * table_entry_size;
+                } else {
+                    _stmt.size_value = _stmt.data_type.count * _stmt.data_type.BaseSize();
+                }
+            } else {
+                _stmt.size_value = value;
+            }
+            _state = ParserState::kName;
+            return ParseResult::kOk;
+        } else if (t.kind == TokenKind::kAlphaNum) {
+            _stmt.name_index = addNameTokenToShstr(t);
+            // Maybe there is something that should be done with the count here
+            // like setting it to 1 in case if it is kTable, but for now let's
+            // just face the fact that it turns out to be zero when count field
+            // is omitted and move on.
+            ASSERT(0 == _stmt.size_value);
+            _state = ParserState::kEndOfLine;
+            return ParseResult::kOk;
+        } else if (IsEndOfStatement(t.kind)) {
+            addTraceNode(static_cast<Statement &&>(_stmt));
+            _state = ParserState::kAddress;
+            return (t.kind == TokenKind::kNone)
+                ? ParseResult::kFinished : ParseResult::kOk;
+        }
+        return errorExpect(t, "numeric or alphanumeric token");
+    case ParserState::kName:
+        if (t.kind == TokenKind::kAlphaNum) {
+            _stmt.name_index = addNameTokenToShstr(t);
+            _state = ParserState::kEndOfLine;
+            return ParseResult::kOk;
+        } else if (IsEndOfStatement(t.kind)) {
+            addTraceNode(static_cast<Statement &&>(_stmt));
+            _state = ParserState::kAddress;
+            return (t.kind == TokenKind::kNone)
+                ? ParseResult::kFinished : ParseResult::kOk;
+        }
+        return errorExpect(t, "alphanumeric token");
+    case ParserState::kEscapeEndOfLine:
+        if (t.kind == TokenKind::kComment) {
+            // Comments are allowed at the end of the line when it supposed to
+            // be escaped with a backslash, but these comments don't get
+            // attached to the trace node, so skip these side comments.
+            return ParseResult::kOk;
+        } else if (IsEndOfStatement(t.kind)) {
+            // Restore the parsing state to continue parsing the trace node
+            // expression.
+            _state = _saved_state;
+            return ParseResult::kOk;
+        }
+        // But backslash may not be inserted anywhere you wish, it is only
+        // allowed before newlines or comments
+        return errorExpect(t, "Comment, EOF, CR, LF or CRLF");
+    case ParserState::kEndOfLine:
+        if (t.kind == TokenKind::kComment) {
+            // Side comments are skipped
+            return ParseResult::kOk;
+        } else if (IsEndOfStatement(t.kind)) {
+            addTraceNode(static_cast<Statement &&>(_stmt));
+            _state = ParserState::kAddress;
+            return (t.kind == TokenKind::kNone)
+                ? ParseResult::kFinished : ParseResult::kOk;
+        }
+        return errorExpect(t, "Comment, EOF, CR, LF or CRLF");
+    case ParserState::kError:
+        // Error recovery: just skip everything to the end of the line, but
+        // honoring the escape symbol '\' (see the top of the function)
+        if (IsEndOfStatement(t.kind)) {
+            _stmt = Statement();
+            _state = ParserState::kAddress;
+            return (t.kind == TokenKind::kNone)
+                ? ParseResult::kFinished : ParseResult::kOk;
+        }
+        return ParseResult::kOk;
+    }
+    UNREACHABLE();
+    return ParseResult::kError;
+}
+
+/// Parses trace table text and, on success, hands the parsed data to `output`.
+///
+/// @param output           receives the arrays of types and nodes and the
+///                         shared string buffer; written only on success
+/// @param trace_data       the text to parse
+/// @param trace_data_size  size of the text in bytes
+/// @param errstream        stream that receives the diagnostics
+/// @param trace_file_name  name shown in diagnostics, may be nullptr
+/// @return true if the whole input has been parsed without errors
+bool ParseTraceData(
+        TraceTable &output,
+        const void *trace_data,
+        size_t trace_data_size,
+        FILE *errstream,
+        const char *trace_file_name)
+{
+    Parser p(trace_data, trace_data_size, errstream, trace_file_name);
+    const ParseResult res = p.Parse();
+    ASSERT(res != ParseResult::kOk);
+    if (ParseResult::kFinished != res) {
+        // The string buffer is not released by the parser, so it has to be
+        // freed here when it is not handed over to the output.
+        free(p.shstr);
+        p.shstr = nullptr;
+        return false;
+    }
+    const auto types_count = p.types.Size();
+    const auto nodes_count = p.nodes.Size();
+    const auto shstr_size = p.shstr_size;
+    auto *shstr = p.shstr;
+    if (shstr_size <= 1) {
+        // Nothing has been written to the shstr array, because it contains only
+        // a single char which is a zero, that is written before the parsing
+        // even starts.
+        free(shstr);
+        shstr = nullptr;
+    }
+    // The sizes were read above, before the arrays are extracted.
+    output._types = p.types.Extract();
+    output._types_count = types_count;
+    output._nodes = p.nodes.Extract();
+    output._nodes_count = nodes_count;
+    output._shstr = shstr;
+    return true;
+}