From c993531d0678de5e29c943fdbb912e1f20957765 Mon Sep 17 00:00:00 2001 From: Oxore Date: Sun, 3 Mar 2024 18:38:46 +0300 Subject: Impl ELF symbols extraction --- src/main.cpp | 463 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 419 insertions(+), 44 deletions(-) (limited to 'src/main.cpp') diff --git a/src/main.cpp b/src/main.cpp index 89aa2ea..2a9b312 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -27,6 +27,19 @@ #include #include +enum class SymbolType: int { + kNone = 0, + kFunction, + kObject, +}; + +struct Symbol { + uint32_t address{}; + SymbolType type{}; + const char *name{}; + size_t size{}; +}; + enum class DisasmMapType { kTraced, kRaw, @@ -35,15 +48,25 @@ enum class DisasmMapType { class DisasmMap { const DisasmMapType _type; DisasmNode *_map[kDisasmMapSizeElements]{}; + Symbol *_symtab{}; + size_t _symtab_size{}; constexpr DisasmNode *findNodeByAddress(uint32_t address) const; + constexpr size_t findFirstSymbolAtAddress( + uint32_t address, bool return_last_considered=false) const; DisasmNode &insertNode(uint32_t address, NodeType); + void insertSymbol(uint32_t address, ReferenceType ref_type); DisasmNode &insertReferencedBy( const uint32_t by_addr, const uint32_t ref_addr, const NodeType type, const ReferenceType ref_type); constexpr bool canBeAllocated(const DisasmNode& node) const; + constexpr size_t symbolsCount() const { return _symtab_size / sizeof *_symtab; } public: + constexpr const Symbol *Symtab() const { return _symtab; } + constexpr size_t SymbolsCount() const { return symbolsCount(); } + constexpr const char *GetFirstSuitableSymbol(const DisasmNode &, bool is_call) const; + constexpr bool HasSymbolsInRange(uint32_t at, size_t length) const; constexpr const DisasmNode *FindNodeByAddress(uint32_t address) const { return findNodeByAddress(address); @@ -53,6 +76,7 @@ public: assert(_type == DisasmMapType::kTraced); insertNode(address, type); } + bool ApplySymbolsFromElf(const ELF::Image &); void Disasm(const DataView &code, const Settings &, size_t from=0, bool nested=false); DisasmMap(DisasmMapType type): _type(type) {} ~DisasmMap(); @@ -65,12 +89,81 @@ constexpr DisasmNode *DisasmMap::findNodeByAddress(uint32_t address) const return nullptr; } +constexpr size_t DisasmMap::findFirstSymbolAtAddress( + uint32_t address, bool return_last_considered) const +{ + if (_symtab == nullptr || symbolsCount() < 1) { + return 0; + } + // A symbol at index 0 is a special null symbol and it must be skipped. + size_t start = 1, len = symbolsCount() - start, middle = start, index = 0; + while (1) { + if (len == 0) { + if (return_last_considered && index == 0) { + index = start; + } + break; + } + middle = start + len / 2; + if (_symtab[middle].address >= address) { + if (_symtab[middle].address == address) { + index = middle; + } + // Look at the span right before the middle one on the next step + len = middle - start; + } else { + // Look at the span right after the middle one on the next step + len -= middle + 1 - start; + start = middle + 1; + } + } + return index; +} + +static constexpr bool IsWithinRange(uint32_t const value, uint32_t at, size_t length) +{ + return value >= at && value < at + length; +} + +constexpr bool DisasmMap::HasSymbolsInRange( + uint32_t const address, size_t const length) const +{ + size_t index = findFirstSymbolAtAddress(address, true); + if (index == 0) { + // The symtab is empty + return false; + } + if (IsWithinRange(_symtab[index].address, address, length)) { + // The symbol is found right at the address, which is unlikely + return true; + } + if (_symtab[index].address < address) { + // Maybe the next symbol falls into the range? + if (index + 1 >= symbolsCount()) { + // No more symbols after the index + return false; + } + index++; + } else { + // Maybe the previous symbol falls into the range? (unlikely at all) + if (index < 2) { + // No more symbols before the index + return false; + } + index--; + } + if (IsWithinRange(_symtab[index].address, address, length)) { + return true; + } + return false; +} + static constexpr uint32_t AlignInstructionAddress(const uint32_t address) { return address & ~1UL; } -DisasmNode &DisasmMap::insertNode(const uint32_t address, const NodeType type) +DisasmNode &DisasmMap::insertNode(uint32_t address, NodeType type) { auto *node = findNodeByAddress(address); if (node) { @@ -127,6 +220,8 @@ static constexpr ReferenceType ReferenceTypeFromRefKindMask1(const RefKindMask r static constexpr ReferenceType ReferenceTypeFromRefKindMask2(const RefKindMask ref_kinds) { + // FIXME: AFAIK it is impossible for a call instruction to have second + // argument. I can probably drop the first condition, but it needs testing return (ref_kinds & kRefCallMask) ? ReferenceType::kCall : (ref_kinds & kRef2ReadMask) @@ -147,6 +242,76 @@ static constexpr bool IsNextLikelyAnInstruction(const Op &op) op.opcode != OpCode::kSTOP); } +static int cmpsym(const void *p1, const void *p2) +{ + const Symbol *sym1 = reinterpret_cast(p1); + const Symbol *sym2 = reinterpret_cast(p2); + if (sym1->address == sym2->address) { + return strcmp(sym1->name, sym2->name); + } + return sym1->address < sym2->address ? -1 : 1; +} + +constexpr SymbolType SymbolTypeFromElf32SymbolType(const ELF::Symbol32Type &t) +{ + if (t == ELF::Symbol32Type::kObject) { + return SymbolType::kObject; + } + if (t == ELF::Symbol32Type::kFunc) { + return SymbolType::kFunction; + } + return SymbolType::kNone; +} + +bool DisasmMap::ApplySymbolsFromElf(const ELF::Image &elf) +{ + const ELF::SectionHeader32 symtab = elf.GetSectionHeaderByName(".symtab"); + if (!symtab.IsValid()) { + fprintf(stderr, "Warning: \".symtab\" is invalid, skipping symbols\n"); + return true; + } + FILE *symtab_stream = open_memstream(reinterpret_cast(&_symtab), &_symtab_size); + if (symtab_stream == nullptr) { + const int err = errno; + fprintf(stderr, + "open_memstream() for symtab failed: Error (%d): \"%s\"\n", + err, strerror(err)); + return false; + } + const Symbol null_symbol{}; + if (null_symbol.name != nullptr && *null_symbol.name != '\0') { + const size_t ret = fwrite( + &null_symbol, sizeof null_symbol, 1, symtab_stream); + (void) ret; + assert(ret == 1); + } + const size_t nentries = symtab.size/symtab.entsize; + for (size_t i = 0; i < nentries; i++) { + const ELF::Symbol32 elfsym = elf.GetSymbolByIndex(i); + const bool has_proper_type = (elfsym.type() == ELF::Symbol32Type::kNoType) || + (elfsym.type() == ELF::Symbol32Type::kObject) || + (elfsym.type() == ELF::Symbol32Type::kFunc); + if (has_proper_type) { + // XXX: Is it possible that it may have binding other than + // Symbol32Bind::kGlobal when it is kFunc? + // XXX: Yes, it is possible. It may be kLocal or kWeak for sure. + const auto type = SymbolTypeFromElf32SymbolType(elfsym.type()); + const auto symbol = Symbol{elfsym.value, type, elfsym.name, elfsym.size}; + if (symbol.name != nullptr && *symbol.name != '\0') { + const size_t ret = fwrite(&symbol, sizeof symbol, 1, symtab_stream); + (void) ret; + assert(ret == 1); + } + } + } + // No more symbols are going to be added further, so it may be closed now. + fclose(symtab_stream); + // The RenderNodeDisassembly() function expects the symbol table to be + // sorted. + qsort(_symtab, symbolsCount(), sizeof *_symtab, cmpsym); + return true; +} + void DisasmMap::Disasm( const DataView &code, const Settings &s, size_t at, bool nested) { @@ -240,6 +405,9 @@ DisasmMap::~DisasmMap() delete node; i += size - 1; } + if (_symtab != nullptr) { + free(_symtab); + } } static size_t RenderRawDataComment( @@ -312,6 +480,25 @@ static constexpr bool IsLocalLocation(const DisasmMap &disasm_map, const DisasmN { for (const ReferenceNode *ref{node.ref_by}; ref; ref = ref->next) { for (size_t i = 0; i < ref->refs_count; i++) { + // Check symtab, because we may be crossing a symbol + const DisasmNode *ref_node = disasm_map.FindNodeByAddress(ref->refs[i].address); + if (ref_node != nullptr) { + // We won't cross a symbol at the address if the reference is + // backwards ('1b') and we will cross a symbol if the reference + // is forwards ('1f') - that's why we shift the range one + // instruction forward by adding a size to the address and the + // length. + // TODO write tests for it + uint32_t const address = (node.address < ref_node->address) + ? node.address + node.size + : ref_node->address + ref_node->size; + size_t const length = (node.address < ref_node->address) + ? ref_node->address + ref_node->size - (node.address + node.size) + : node.address + node.size - (ref_node->address + ref_node->size); + if (disasm_map.HasSymbolsInRange(address, length)) { + return false; + } + } const ReferenceRecord &ref_rec = ref->refs[i]; if (ref_rec.type == ReferenceType::kCall) { // Locals are definitely not made for calls @@ -356,47 +543,183 @@ static constexpr const char *StringWihoutFristNChars(const char *str, const size return str; } +constexpr const char *DisasmMap::GetFirstSuitableSymbol( + const DisasmNode &node, bool is_call) const +{ + const size_t index = findFirstSymbolAtAddress(node.address); + if (index == 0) { + return nullptr; + } + if (!is_call) { + return _symtab[index].name; + } + for (size_t i = index; i < symbolsCount() && _symtab[i].address == node.address; i++) { + if (_symtab[i].type == SymbolType::kFunction) { + return _symtab[i].name; + } + } + return nullptr; +} + +struct PendingObjectSize { + PendingObjectSize *next{}; + uint32_t at{}; + const char *name{}; +}; + +struct PendingObjectSizeList { + PendingObjectSize *_first{}, *_last{}; + void Add(uint32_t at, const char *name) + { + assert(name && *name); + // Last in first out + PendingObjectSize *pending = new PendingObjectSize{_first, at, name}; + assert(pending); + if (_last == nullptr) { + _last = pending; + } + _first = pending; + } + const char *TakeNext(uint32_t at) + { + for (PendingObjectSize *cur = _first, *prev = nullptr; cur;) { + // Last in first out + if (cur->at == at) { + const char *name = cur->name; + if (prev) { + prev->next = cur->next; + } else { + _first = cur->next; + } + if (_last == cur) { + _last = prev; + } + delete cur; + return name; + } + prev = cur; + cur = cur->next; + } + return nullptr; + } + ~PendingObjectSizeList() + { + while (_first) { + auto *cur = _first; + _first = _first->next; + delete cur; + } + _last = nullptr; + } +}; + +static constexpr const char *SymbolTypeToElfTypeString(SymbolType t) +{ + switch (t) { + case SymbolType::kNone: return nullptr; + case SymbolType::kFunction: return "function"; + case SymbolType::kObject: return "object"; + } + return nullptr; +} + static void RenderNodeDisassembly( FILE *const output, const DisasmMap &disasm_map, const DataView &code, const Settings &s, - const DisasmNode &node) + const DisasmNode &node, + size_t &symbol_index, + PendingObjectSizeList &pending_size) { - if (node.ref_by) { - const bool is_local = IsLocalLocation(disasm_map, node); - if (s.labels && !(s.short_ref_local_labels && is_local)) { - const bool export_this_function = s.export_functions && HasCallReference(node); + for (const char *name = pending_size.TakeNext(node.address); name;) { + fprintf(output, "%s.size\t%s,.-%s\n", s.indent, name, name); + name = pending_size.TakeNext(node.address); + } + const size_t symtab_size = disasm_map.SymbolsCount(); + bool have_rendered_label_already = false; + bool have_rendered_function_label_already = false; + if (disasm_map.Symtab() != nullptr && symtab_size > 0) { + for (; symbol_index < symtab_size; symbol_index++) { + if (disasm_map.Symtab()[symbol_index].address >= node.address) { + break; + } + } + for (; symbol_index < symtab_size; symbol_index++) { + const auto &symbol = disasm_map.Symtab()[symbol_index]; + if (symbol.address != node.address) { + break; + } + if (symbol.name != nullptr || *symbol.name == '\0') { + fprintf(output, "\n%s.globl\t%s\n", s.indent, symbol.name); + if (symbol.type == SymbolType::kFunction) { + have_rendered_function_label_already = true; + } + const char *const type = SymbolTypeToElfTypeString(symbol.type); + if (type) { + fprintf(output, "%s.type\t%s, @%s\n", s.indent, symbol.name, type); + } + if (symbol.size > 0) { + pending_size.Add(node.address + symbol.size, symbol.name); + } + fprintf(output, "%s:\n", disasm_map.Symtab()[symbol_index].name); + have_rendered_label_already = true; + } + } + } + const bool is_local = s.short_ref_local_labels && IsLocalLocation(disasm_map, node); + do { + // Skip generating label or short jump label in-place in case if there + // are no referrers or we already have a suitable label from ELF's + // symtab or some other sources, that has been printed in the code + // section above. + if (node.ref_by == nullptr) { + break; + } + const bool have_call_reference = HasCallReference(node); + if (have_call_reference && have_rendered_function_label_already) { + break; + } + if (have_rendered_label_already) { + break; + } + // If we got here it must be that there is no suitable symbol found in + // the symtab, so it must be generated in-place. + constexpr auto generated_name_length = sizeof "L00000000"; + char name[generated_name_length + 1] = {0}; + snprintf(name, generated_name_length, "L%08x", node.address); + if (s.labels && !is_local) { + const bool export_this_function = s.export_functions && have_call_reference; const bool export_this_label = s.export_all_labels || (s.export_labels && node.ref_by && (node.ref_by->refs_count > 1)) || export_this_function; if (export_this_label) { - fprintf(output, "\n%s.globl\tL%08x\n", s.indent, node.address); + fprintf(output, "\n%s.globl\t%s\n", s.indent, name); if (export_this_function) { - fprintf(output, "%s.type\tL%08x, @function\n", s.indent, node.address); + fprintf(output, "%s.type\t%s, @function\n", s.indent, name); } } } - if (s.xrefs_from && !(s.short_ref_local_labels && is_local)) { - fprintf(output, "| XREFS:\n"); - for (const ReferenceNode *ref{node.ref_by}; ref; ref = ref->next) { - if (ref->refs_count == 0) { - continue; - } - fprintf(output, "|"); - for (size_t i = 0; i < ref->refs_count; i++) { - const ReferenceRecord r = ref->refs[i]; - fprintf(output, " %s @%08x", ReferenceTypeToString(r.type), r.address); - } - fprintf(output, "\n"); - } - } if (s.labels) { - if (s.short_ref_local_labels && is_local) { + if (is_local) { fprintf(output, "1:%s", StringWihoutFristNChars(s.indent, (sizeof "1:") - 1)); } else { - fprintf(output, "L%08x:\n", node.address); + fprintf(output, "%s:\n", name); + } + } + } while (0); + if (s.xrefs_from && !(is_local && !have_rendered_label_already)) { + fprintf(output, "| XREFS:\n"); + for (const ReferenceNode *ref{node.ref_by}; ref; ref = ref->next) { + if (ref->refs_count == 0) { + continue; + } + fprintf(output, "|"); + for (size_t i = 0; i < ref->refs_count; i++) { + const ReferenceRecord r = ref->refs[i]; + fprintf(output, " %s @%08x", ReferenceTypeToString(r.type), r.address); } + fprintf(output, "\n"); } } assert(node.op.opcode != OpCode::kNone); @@ -430,20 +753,32 @@ static void RenderNodeDisassembly( : 0) | ((s.imm_labels && ref1) ? (node.ref_kinds & kRef1ImmMask) : 0) | (node.ref_kinds & (kRefDataMask | kRefPcRelFix2Bytes)); - const bool ref1_is_local = !ref1 || IsLocalLocation(disasm_map, *ref1); + const bool ref1_is_local = s.short_ref_local_labels && + ref1 && IsLocalLocation(disasm_map, *ref1); char ref1_label[32]{}; if (ref1) { - if (s.short_ref_local_labels && ref1_is_local) { + const bool is_call = + ReferenceType::kCall == ReferenceTypeFromRefKindMask1(ref_kinds); + const char *sym_name = disasm_map.GetFirstSuitableSymbol(*ref1, is_call); + if (sym_name) { + snprintf(ref1_label, (sizeof ref1_label), "%s", sym_name); + } else if (ref1_is_local) { const char dir = ref1_addr <= node.address ? 'b' : 'f'; snprintf(ref1_label, (sizeof ref1_label), "1%c", dir); } else { - snprintf(ref1_label, (sizeof ref1_label), "L%08x", ref1_addr); + snprintf(ref1_label, (sizeof ref1_label), "L%08x", ref1_addr); } } - const bool ref2_is_local = !ref2 || IsLocalLocation(disasm_map, *ref2); + const bool ref2_is_local = s.short_ref_local_labels && + ref2 && IsLocalLocation(disasm_map, *ref2); char ref2_label[32]{}; if (ref2) { - if (s.short_ref_local_labels && ref2_is_local) { + const bool is_call = + ReferenceType::kCall == ReferenceTypeFromRefKindMask2(ref_kinds); + const char *sym_name = disasm_map.GetFirstSuitableSymbol(*ref2, is_call); + if (sym_name) { + snprintf(ref2_label, (sizeof ref2_label), "%s", sym_name); + } else if (ref2_is_local) { const char dir = ref2_addr <= node.address ? 'b' : 'f'; snprintf(ref2_label, (sizeof ref2_label), "1%c", dir); } else { @@ -461,12 +796,11 @@ static void RenderNodeDisassembly( ref1_addr, ref2_addr); const bool ref1_from_imm_ok = ((node.ref_kinds & kRef1ImmMask) ? s.imm_labels : true); - if (s.xrefs_to && !(s.short_ref_local_labels && ref1_is_local) && ref1_from_imm_ok) - { - fprintf(output, " | L%08x", ref1_addr); + if (s.xrefs_to && ref1 && !ref1_is_local && ref1_from_imm_ok) { + fprintf(output, " | XREF1 @%08x", ref1_addr); } - if (s.xrefs_to && !(s.short_ref_local_labels && ref2_is_local)) { - fprintf(output, " | L%08x", ref2_addr); + if (s.xrefs_to && ref2 && !ref2_is_local) { + fprintf(output, " | XREF2 @%08x", ref2_addr); } } else { node.op.FPrint(output, s.indent, s.imm_hex); @@ -484,21 +818,54 @@ static void RenderNodeDisassembly( fprintf(output, "\n"); } +static void RenderNonCodeSymbols( + FILE *const output, const DisasmMap &disasm_map, const DataView &code, const Settings &s) +{ + const size_t symtab_size = disasm_map.SymbolsCount(); + for (size_t i = 0; i < symtab_size; i++) { + const auto &symbol = disasm_map.Symtab()[i]; + if (symbol.address <= code.size) { + continue; + } + fprintf(output, "\n%s.globl\t%s\n", s.indent, symbol.name); + const char *const type = SymbolTypeToElfTypeString(symbol.type); + if (type) { + fprintf(output, "%s.type\t%s, @%s\n", s.indent, symbol.name, type); + } + fprintf(output, "%s = 0x%08x\n", symbol.name, symbol.address); + if (symbol.size) { + fprintf(output, "%s.size\t%s, 0x%zx\n", s.indent, symbol.name, symbol.size); + } + } +} + static void RenderDisassembly( FILE *const output, const DisasmMap &disasm_map, const DataView &code, const Settings &s) { - for (size_t i = 0; i < code.size;) { + // This list is used to track all places where ".size fnname, .-fnname" + // directives must be put. + PendingObjectSizeList pending_size{}; + // sym_i starts with 1 because 0 is a special null symbol + for (size_t i = 0, sym_i = 1; i < code.size;) { + const DisasmNode raw = DisasmNode{ + /* .type = */ NodeType::kTracedInstruction, + /* .address = */ static_cast(i), + /* .size = */ 2, + /* .ref_kinds = */ 0, + /* .ref1_addr = */ 0, + /* .ref2_addr = */ 0, + /* .ref_by = */ nullptr, + /* .last_ref_by = */ nullptr, + /* .op = */ Op::Raw(GetU16BE(code.buffer + i)), + }; const DisasmNode *node = disasm_map.FindNodeByAddress(i); - if (node) { - RenderNodeDisassembly(output, disasm_map, code, s, *node); - i += node->size; - } else { - auto raw = Op::Raw(GetU16BE(code.buffer + i)); - raw.FPrint(output, s.indent, s.imm_hex); - fprintf(output, "\n"); - i += kInstructionSizeStepBytes; + if (node == nullptr) { + node = &raw; } + RenderNodeDisassembly(output, disasm_map, code, s, *node, sym_i, pending_size); + i += node->size; } + RenderNonCodeSymbols(output, disasm_map, code, s); } static void ParseTraceData(DisasmMap &disasm_map, const DataView &trace_data) @@ -580,7 +947,7 @@ static DisasmMap *NewDisasmMap(FILE *trace_stream) } // Parse trace file into map DisasmMap *disasm_map = new DisasmMap{DisasmMapType::kTraced}; - assert(disasm_map); + assert(disasm_map != nullptr); ParseTraceData(*disasm_map, trace_data.View()); return disasm_map; } @@ -614,6 +981,11 @@ static int M68kDisasm( if (disasm_map == nullptr) { return EXIT_FAILURE; } + if (from_elf && s.symbols) { + if (false == disasm_map->ApplySymbolsFromElf(elf)) { + return EXIT_FAILURE; + } + } // Disasm into output map disasm_map->Disasm(code, s); // Print output into output_stream @@ -652,6 +1024,7 @@ static bool ApplyFeature(Settings& s, const char *feature_arg) { &Settings::imm_hex, "imm-hex" }, { &Settings::follow_jumps, "follow-jumps" }, { &Settings::walk, "walk" }, + { &Settings::symbols, "symbols" }, }; constexpr size_t sizeof_no_prefix = (sizeof "no-") - 1; const bool disable = FeatureStringHasPrefixNo(feature_arg); @@ -708,6 +1081,8 @@ static void PrintUsage(FILE *s, const char *argv0) " follow-jumps Follow jumps to statically known locations.\n" " walk Try best to detect further instructions following known\n" " traced locations without overcommitting.\n" + " symbols Extract and apply symbols from input file if available.\n" + " ELF symbols only are currently supported.\n" , argv0); } -- cgit v1.2.3