diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 07277b8f..610e1f8b 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -34,6 +34,7 @@ set(SHARED_RUNTIME_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/RuntimeCommon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/LibcWrappers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Shadow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/Tracer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/GarbageCollection.cpp) if (${QSYM_BACKEND}) diff --git a/runtime/GarbageCollection.cpp b/runtime/GarbageCollection.cpp index 8afdd327..01c9be7f 100644 --- a/runtime/GarbageCollection.cpp +++ b/runtime/GarbageCollection.cpp @@ -23,6 +23,10 @@ /// A list of memory regions that are known to contain symbolic expressions. std::vector expressionRegions; +std::vector& getExpressionRegions() { + return expressionRegions; +} + void registerExpressionRegion(ExpressionRegion r) { expressionRegions.push_back(std::move(r)); } diff --git a/runtime/GarbageCollection.h b/runtime/GarbageCollection.h index 81b0b8c2..0a118ed4 100644 --- a/runtime/GarbageCollection.h +++ b/runtime/GarbageCollection.h @@ -28,6 +28,7 @@ using ExpressionRegion = std::pair; /// Add the specified region to the list of places to search for symbolic /// expressions. void registerExpressionRegion(ExpressionRegion r); +std::vector& getExpressionRegions(); /// Return the set of currently reachable symbolic expressions. std::set collectReachableExpressions(); diff --git a/runtime/RuntimeCommon.cpp b/runtime/RuntimeCommon.cpp index 127c81de..6d6da672 100644 --- a/runtime/RuntimeCommon.cpp +++ b/runtime/RuntimeCommon.cpp @@ -27,6 +27,7 @@ #include "GarbageCollection.h" #include "RuntimeCommon.h" #include "Shadow.h" +#include "Tracer.h" namespace { @@ -486,3 +487,10 @@ SymExpr _sym_build_bit_to_bool(SymExpr expr) { return _sym_build_not_equal(expr, _sym_build_integer(0, _sym_bits_helper(expr))); } + +void _sym_trace_execution(uintptr_t pc) { Tracer::traceStep(pc); +} + +void _sym_finalize_tracing() { + Tracer::writeTraceToDisk(); +} \ No newline at end of file diff --git a/runtime/RuntimeCommon.h b/runtime/RuntimeCommon.h index 4c05ceb0..8218a694 100644 --- a/runtime/RuntimeCommon.h +++ b/runtime/RuntimeCommon.h @@ -211,6 +211,8 @@ void _sym_notify_basic_block(uintptr_t site_id); */ const char *_sym_expr_to_string(SymExpr expr); // statically allocated bool _sym_feasible(SymExpr expr); +void _sym_trace_execution(uintptr_t pc); +void _sym_finalize_tracing(void); /* * Garbage collection diff --git a/runtime/Tracer.cpp b/runtime/Tracer.cpp new file mode 100644 index 00000000..0ba52c5a --- /dev/null +++ b/runtime/Tracer.cpp @@ -0,0 +1,113 @@ + +#include "Tracer.h" +#include "Shadow.h" +#include "llvm/Support/raw_ostream.h" + +nlohmann::json Tracer::currentTrace; +nlohmann::json Tracer::expressions; +nlohmann::json Tracer::pathConstraints; +const std::string Tracer::BACKEND_TRACE_FILE = "/tmp/backend_trace.json"; + +void Tracer::traceStep(uintptr_t pc) { + + nlohmann::json newEntry; + newEntry["pc"] = pc; + newEntry["memory_to_expression_mapping"] = nlohmann::json::object(); + + /* dump shadow pages */ + for (auto const &[pageAddress, _] : g_shadow_pages) { + for (auto byteAddress = pageAddress; byteAddress < pageAddress + kPageSize; + byteAddress++) { + auto byteExpr = _sym_read_memory((u_int8_t *)byteAddress, 1, true); + if (byteExpr != nullptr && !byteExpr->isConcrete()) { + + newEntry["memory_to_expression_mapping"][std::to_string(reinterpret_cast(byteAddress))] = + getExpressionID(byteExpr); + } + } + } + + /* dump regions that the client has registered with _sym_register_expression_region + * for SymQEMU, this is env_exprs which contains the expressions for the global TCG variables (i.e., guest registers) */ + for (auto &expressionRegion : getExpressionRegions()){ + for (auto byteAddress = expressionRegion.first; byteAddress < expressionRegion.first + expressionRegion.second / sizeof(byteAddress); + byteAddress++) { + auto byteExpr = *byteAddress; + if (byteExpr != nullptr && !byteExpr->isConcrete()) { + + newEntry["memory_to_expression_mapping"][std::to_string(reinterpret_cast(byteAddress))] = + getExpressionID(byteExpr); + } + } + } + + currentTrace.push_back(newEntry); +} + + +void Tracer::tracePathConstraint(SymExpr constraint, bool taken) { + if (pathConstraints.empty()) { + symcc_set_test_case_handler( + reinterpret_cast(traceNewInput)); + } + + nlohmann::json newEntry; + newEntry["expression"] = getExpressionID(constraint); + newEntry["after_step"] = currentTrace.size() - 1; + newEntry["new_input_value"] = nlohmann::json(); + newEntry["taken"] = taken; + + pathConstraints.push_back(newEntry); +} + +void Tracer::traceNewInput(const unsigned char *input, size_t size) { + for (size_t i = 0; i < size; i++) { + pathConstraints[pathConstraints.size() - 1]["new_input_value"].push_back( + input[i]); + } +} + +void Tracer::writeTraceToDisk() { + for (auto const &[_, expressionPtr] : getAllocatedExpressions()) { + recursivelyCollectExpressions(expressionPtr); + } + + nlohmann::json dataToSave; + dataToSave["trace"] = currentTrace; + dataToSave["expressions"] = expressions; + dataToSave["path_constraints"] = pathConstraints; + + std::ofstream o(BACKEND_TRACE_FILE); + o << std::setw(4) << dataToSave << std::endl; +} + +void Tracer::recursivelyCollectExpressions(const shared_ptr&expressionPtr) { + string expressionID = getExpressionID(expressionPtr.get()); + if (expressions.count(expressionID) > 0) { + return; + } + + expressions[expressionID]["operation"]["kind"] = expressionPtr->kind(); + expressions[expressionID]["operation"]["properties"] = nlohmann::json::object(); + if (expressionPtr->kind() == qsym::Constant){ + auto value_llvm_int = static_pointer_cast(expressionPtr)->value(); + std::string value_str; + llvm::raw_string_ostream rso(value_str); + value_llvm_int.print(rso, false); + + expressions[expressionID]["operation"]["properties"]["value"] = value_str; + } + expressions[expressionID]["size_bits"] = expressionPtr->bits(); + expressions[expressionID]["input_byte_dependency"] = *expressionPtr->getDependencies(); + expressions[expressionID]["args"] = nlohmann::json::array(); + for (int child_i = 0; child_i < expressionPtr->num_children(); child_i++) { + shared_ptr child = expressionPtr->getChild(child_i); + string childID = getExpressionID(child.get()); + expressions[expressionID]["args"].push_back(childID); + recursivelyCollectExpressions(child); + } +} + +string Tracer::getExpressionID(SymExpr expression) { + return std::to_string(reinterpret_cast(expression)); +} diff --git a/runtime/Tracer.h b/runtime/Tracer.h new file mode 100644 index 00000000..fd1cb448 --- /dev/null +++ b/runtime/Tracer.h @@ -0,0 +1,58 @@ + +#ifndef SYMRUNTIME_TRACER_H +#define SYMRUNTIME_TRACER_H + +#include "GarbageCollection.h" +#include "Runtime.h" +#include +#include +#include + +/** + * Generates a trace of the symbolic execution. + * + * A trace contains a list of symbolic memory snapshots, a list of path constraints and a list of expressions. + * The trace is written to a json file at the end of the execution. + */ +class Tracer { +public: + /** + * Adds a dump of the current symbolic state to the trace. + */ + static void traceStep(uintptr_t pc); + + /** + * Adds a path constraint to the trace. + */ + static void tracePathConstraint(SymExpr constraint, bool taken); + + /** + * Adds an input generated by the symbolic backend. + * The input is associated to the last path constraint. + */ + static void traceNewInput(const unsigned char* input, size_t size); + + /** + * Generates a json file containing the trace. + * Must be called before the program exits. + */ + static void writeTraceToDisk(); + +private: + /** + * Collects all expressions reachable from the given expression. + */ + static void recursivelyCollectExpressions(const shared_ptr&expressionPtr); + + /** + * Generates a string ID from the address of the given expression. + */ + static string getExpressionID(SymExpr expression); + + static nlohmann::json currentTrace; + static nlohmann::json expressions; + static nlohmann::json pathConstraints; + static const std::string BACKEND_TRACE_FILE; +}; + +#endif // SYMRUNTIME_TRACER_H diff --git a/runtime/qsym_backend/CMakeLists.txt b/runtime/qsym_backend/CMakeLists.txt index 34c01320..d5d76744 100644 --- a/runtime/qsym_backend/CMakeLists.txt +++ b/runtime/qsym_backend/CMakeLists.txt @@ -83,7 +83,13 @@ target_include_directories(SymRuntime SYSTEM PRIVATE # We need to get the LLVM support component for llvm::APInt. llvm_map_components_to_libnames(QSYM_LLVM_DEPS support) -target_link_libraries(SymRuntime ${Z3_LIBRARIES} ${QSYM_LLVM_DEPS}) +include(FetchContent) + +FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz) +FetchContent_MakeAvailable(json) + + +target_link_libraries(SymRuntime ${Z3_LIBRARIES} ${QSYM_LLVM_DEPS} nlohmann_json::nlohmann_json) # We use std::filesystem, which has been added in C++17. Before its official # inclusion in the standard library, Clang shipped the feature first in diff --git a/runtime/qsym_backend/Runtime.cpp b/runtime/qsym_backend/Runtime.cpp index 650787bf..3f50b28c 100644 --- a/runtime/qsym_backend/Runtime.cpp +++ b/runtime/qsym_backend/Runtime.cpp @@ -19,6 +19,7 @@ #include "Runtime.h" #include "GarbageCollection.h" +#include "Tracer.h" // C++ #if __has_include() @@ -154,6 +155,10 @@ EnhancedQsymSolver *g_enhanced_solver; } // namespace +std::map &getAllocatedExpressions() { + return allocatedExpressions; +} + using namespace qsym; #if HAVE_FILESYSTEM @@ -321,6 +326,7 @@ void _sym_push_path_constraint(SymExpr constraint, int taken, if (constraint == nullptr) return; + Tracer::tracePathConstraint(constraint, taken != 0); g_solver->addJcc(allocatedExpressions.at(constraint), taken != 0, site_id); } @@ -449,7 +455,7 @@ bool _sym_feasible(SymExpr expr) { // void _sym_collect_garbage() { - if (allocatedExpressions.size() < g_config.garbageCollectionThreshold) +// if (allocatedExpressions.size() < g_config.garbageCollectionThreshold) return; #ifdef DEBUG_RUNTIME diff --git a/runtime/qsym_backend/Runtime.h b/runtime/qsym_backend/Runtime.h index 8f19d2af..27a15aa2 100644 --- a/runtime/qsym_backend/Runtime.h +++ b/runtime/qsym_backend/Runtime.h @@ -21,4 +21,6 @@ typedef qsym::Expr *SymExpr; #include +std::map &getAllocatedExpressions(); + #endif diff --git a/util/symbolic_trace/.gitignore b/util/symbolic_trace/.gitignore new file mode 100644 index 00000000..e5a60ce4 --- /dev/null +++ b/util/symbolic_trace/.gitignore @@ -0,0 +1,4 @@ +build +symcctrace.egg-info +.idea +__pycache__ \ No newline at end of file diff --git a/util/symbolic_trace/README.md b/util/symbolic_trace/README.md new file mode 100644 index 00000000..6f03e33f --- /dev/null +++ b/util/symbolic_trace/README.md @@ -0,0 +1,7 @@ +This is a python package for parsing a trace generated by the tracing feature of SymCC. + +Install it with + +``` +pip install . +``` \ No newline at end of file diff --git a/util/symbolic_trace/pyproject.toml b/util/symbolic_trace/pyproject.toml new file mode 100644 index 00000000..1551312a --- /dev/null +++ b/util/symbolic_trace/pyproject.toml @@ -0,0 +1,3 @@ +[project] +name = "symcctrace" +version = "1.0" diff --git a/util/symbolic_trace/symcctrace/__init__.py b/util/symbolic_trace/symcctrace/__init__.py new file mode 100644 index 00000000..6b9d7eef --- /dev/null +++ b/util/symbolic_trace/symcctrace/__init__.py @@ -0,0 +1,2 @@ +from .parsing import parse_trace +from .data import TraceData diff --git a/util/symbolic_trace/symcctrace/data.py b/util/symbolic_trace/symcctrace/data.py new file mode 100644 index 00000000..a965c951 --- /dev/null +++ b/util/symbolic_trace/symcctrace/data.py @@ -0,0 +1,115 @@ +import enum +import dataclasses + + +class ExpressionKind(enum.Enum): + BOOL = 0 + CONSTANT = 1 + READ = 2 + CONCAT = 3 + EXTRACT = 4 + ZEXT = 5 + SEXT = 6 + ADD = 7 + SUB = 8 + MUL = 9 + UDIV = 10 + SDIV = 11 + UREM = 12 + SREM = 13 + NEG = 14 + NOT = 15 + AND = 16 + OR = 17 + XOR = 18 + SHL = 19 + LSHR = 20 + ASHR = 21 + EQUAL = 22 + DISTINCT = 23 + ULT = 24 + ULE = 25 + UGT = 26 + UGE = 27 + SLT = 28 + SLE = 29 + SGT = 30 + SGE = 31 + LOR = 32 + LAND = 33 + LNOT = 34 + ITE = 35 + ROL = 36 + ROR = 37 + INVALID = 38 + + +@dataclasses.dataclass +class Operation: + kind: ExpressionKind + properties: dict + + +@dataclasses.dataclass +class RawExpression: + operation: Operation + size_bits: int + input_byte_dependency: list[int] + args: list[str] + + +@dataclasses.dataclass +class RawTraceStep: + pc: int + memory_to_expression_mapping: dict[str, str] + + +@dataclasses.dataclass +class RawPathConstraint: + expression: str + after_step: int + new_input_value: list[int] | None + taken: bool + + +@dataclasses.dataclass +class RawTraceData: + trace: list[RawTraceStep] + expressions: dict[str, RawExpression] + path_constraints: list[RawPathConstraint] + + +@dataclasses.dataclass +class Expression: + operation: Operation + size_bits: int + input_byte_dependency: list[int] + args: list['Expression'] + + +@dataclasses.dataclass +class TraceStep: + pc: int + memory_to_expression_mapping: dict[str, Expression] + + +@dataclasses.dataclass +class PathConstraint: + expression: Expression + after_step: TraceStep + new_input_value: bytes | None + taken: bool + + +@dataclasses.dataclass +class MemoryArea: + address: int + name: str + + +@dataclasses.dataclass +class TraceData: + trace: list[TraceStep] + expressions: dict[str, Expression] + path_constraints: list[PathConstraint] + memory_areas: list[MemoryArea] diff --git a/util/symbolic_trace/symcctrace/parsing.py b/util/symbolic_trace/symcctrace/parsing.py new file mode 100644 index 00000000..c86396f1 --- /dev/null +++ b/util/symbolic_trace/symcctrace/parsing.py @@ -0,0 +1,128 @@ +import functools +import itertools +import json +import math +import pathlib + +import cattrs + +from . import data + +MEMORY_AREA_MAX_DISTANCE = 0x1000000 + + +def parse_trace( + trace_file: pathlib.Path, + memory_region_names_file: pathlib.Path, +) -> data.TraceData: + """Parses a trace generated by the tracing feature of SymCC. + + Args + trace_file: The path to the trace file generated by SymCC. + memory_region_names_file: The path to a file containing a mapping from memory region names to their addresses. + """ + with open(trace_file) as file: + raw_trace_data = cattrs.Converter(forbid_extra_keys=True).structure(json.load(file), data.RawTraceData) + + with open(memory_region_names_file) as file: + def convert_to_memory_area(raw_memory_area: dict[str, str]) -> data.MemoryArea: + return cattrs.Converter(forbid_extra_keys=True).structure(raw_memory_area, data.MemoryArea) + + memory_areas = list(map(convert_to_memory_area, json.load(file))) + + expressions = _convert_expressions(raw_trace_data.expressions) + trace_steps = list( + map(functools.partial(_convert_trace_step, expressions=expressions, memory_areas=memory_areas), + raw_trace_data.trace)) + path_constraints = list( + map(functools.partial(_convert_path_constraint, expressions=expressions, trace_steps=trace_steps), + raw_trace_data.path_constraints)) + + trace_data = data.TraceData( + trace=trace_steps, + expressions=expressions, + path_constraints=path_constraints, + memory_areas=memory_areas + ) + + return trace_data + + +def _convert_operation(raw_operation: data.Operation, size_bits: int) -> data.Operation: + operation = data.Operation( + kind=data.ExpressionKind(raw_operation.kind), + properties=raw_operation.properties + ) + + if 'value' in operation.properties: + operation.properties['value'] = int(operation.properties['value']).to_bytes(math.ceil(size_bits / 8), 'little') + + return operation + + +def _convert_expressions(raw_expressions: dict[str, data.RawExpression]) -> dict[str, data.Expression]: + expressions = {} + + def recursively_create_expression(expression_id: str): + if expression_id in expressions: + return expressions[expression_id] + + raw_expression = raw_expressions[expression_id] + args = [recursively_create_expression(arg) for arg in raw_expression.args] + + expression = data.Expression( + operation=_convert_operation(raw_expression.operation, raw_expression.size_bits), + args=args, + input_byte_dependency=raw_expression.input_byte_dependency, + size_bits=raw_expression.size_bits + ) + expressions[expression_id] = expression + return expression + + for expression_id in raw_expressions: + recursively_create_expression(expression_id) + + return expressions + + +def _raw_memory_address_to_named_location(raw_memory_address: int, memory_areas: list[data.MemoryArea]) -> str: + def distance(memory_area: data.MemoryArea) -> int: + return raw_memory_address - memory_area.address + + def is_candidate(memory_area: data.MemoryArea) -> bool: + return \ + abs(distance(memory_area)) < MEMORY_AREA_MAX_DISTANCE \ + if memory_area.name == 'stack' \ + else 0 <= distance(memory_area) < MEMORY_AREA_MAX_DISTANCE + + def absolute_distance(memory_area: data.MemoryArea) -> int: + return abs(distance(memory_area)) + + closest_memory_area = min(filter(is_candidate, memory_areas), key=absolute_distance) + + return f'{closest_memory_area.name}+{hex(distance(closest_memory_area))}' + + +def _convert_trace_step(raw_trace_step: data.RawTraceStep, expressions: dict[str, data.Expression], + memory_areas: list[data.MemoryArea]) -> data.TraceStep: + def convert_mapping(mapping: dict[str, str]) -> dict[str, data.Expression]: + def convert_mapping_element(raw_address: str, expression_id: str) -> tuple[str, data.Expression]: + return _raw_memory_address_to_named_location(int(raw_address), memory_areas), expressions[expression_id] + + return dict(itertools.starmap(convert_mapping_element, mapping.items())) + + return data.TraceStep( + pc=raw_trace_step.pc, + memory_to_expression_mapping=convert_mapping(raw_trace_step.memory_to_expression_mapping) + ) + + +def _convert_path_constraint(raw_path_constraint: data.RawPathConstraint, expressions: dict[str, data.Expression], + trace_steps: list[data.TraceStep]) -> data.PathConstraint: + return data.PathConstraint( + expression=expressions[raw_path_constraint.expression], + after_step=trace_steps[raw_path_constraint.after_step], + new_input_value=bytes( + raw_path_constraint.new_input_value) if raw_path_constraint.new_input_value is not None else None, + taken=raw_path_constraint.taken + )