Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
add_subdirectory("Examples")
add_subdirectory("GenJSON")
add_subdirectory("Index")
add_subdirectory("Query")
188 changes: 188 additions & 0 deletions bin/Index/BuildAST.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
// Copyright (c) 2022-present, Trail of Bits, Inc.
// All rights reserved.
//
// This source code is licensed in accordance with the terms specified in
// the LICENSE file found in the root directory of this source tree.

#include "BuildAST.h"

#include <algorithm>
#include <cassert>
#include <fstream>
#include <glog/logging.h>
#include <iostream>
#include <tuple>
#include <unordered_map>
#include <vector>
#include <multiplier/NodeKind.h>

namespace indexer {

static void SerializeAST(mx::Fragment fragment, ServerContext &ctx) {
auto &ast = ctx->ast;
std::unordered_map<unsigned, std::uint64_t> ctx_to_node_id;

for (mx::Token tok : mx::Token::in(fragment)) {
// Skip whitespaces
switch (tok.kind()) {
case mx::TokenKind::UNKNOWN:
case mx::TokenKind::WHITESPACE:
case mx::TokenKind::COMMENT:
continue;
default:
if (tok.data().empty()) {
continue;
}
break;
}
ctx->spelling_to_token_kind.Set(tok.data(), tok.kind());

// Start with the token node
mx::ASTNode node{};
node.kind = mx::syntex::NodeKind{tok.kind()}.Serialize();
node.entity = tok.id();
node.spelling = std::string(tok.data().data(), tok.data().size());
node.prev = ast.GetNodeInIndex(fragment.id(), node.kind);
std::optional<std::uint64_t> node_id = ast.AddNode(node);
ast.SetNodeInIndex(fragment.id(), node.kind, *node_id);

for (auto ctx = mx::TokenContext::of(tok); ctx; ctx = ctx->parent()) {
auto it = ctx_to_node_id.find(ctx->id());

// Add to parent node's children if it already exists

if (it != ctx_to_node_id.end()) {
ast.AddChild(it->second, *node_id);
node_id = std::nullopt;
break;
}

// Otherwise we need to create a new parent node

if (auto decl = mx::Decl::from(*ctx)) {
mx::ASTNode parent{};
parent.kind = mx::syntex::NodeKind{decl->kind()}.Serialize();
parent.entity = decl->id();
parent.prev = ast.GetNodeInIndex(fragment.id(), parent.kind);
auto parent_id = ast.AddNode(parent);
// Add it to the index
ast.SetNodeInIndex(fragment.id(), parent.kind, parent_id);
ctx_to_node_id[ctx->id()] = parent_id;
ast.AddChild(parent_id, *node_id);
node_id = parent_id;
continue;
}

if (auto stmt = mx::Stmt::from(*ctx)) {
mx::ASTNode parent{};
parent.kind = mx::syntex::NodeKind{stmt->kind()}.Serialize();
parent.entity = stmt->id();
parent.prev = ast.GetNodeInIndex(fragment.id(), parent.kind);
auto parent_id = ast.AddNode(parent);
// Add it to the index
ast.SetNodeInIndex(fragment.id(), parent.kind, parent_id);
ctx_to_node_id[ctx->id()] = parent_id;
ast.AddChild(parent_id, *node_id);
node_id = parent_id;
continue;
}
}

// If we didn't add the token to a pre-existing parent, add it to the root

if (node_id.has_value()) {
ast.AddNodeToRoot(fragment.id(), *node_id);
}
}
}

static void ImportGrammar(mx::Fragment fragment, ServerContext& ctx) {
auto &ast = ctx->ast;
auto &grammar = ctx->grammar;
auto nodes = ast.Root(fragment.id());

// Make a production rule for every node and its children.
while (!nodes.empty()) {
auto node_id = nodes.back();
nodes.pop_back();

auto node = ast.GetNode(node_id);
auto node_kind = mx::syntex::NodeKind::Deserialize(node.kind);

if (!node_kind.IsToken()) {
// This is an internal or root node. E.g. given the following:
//
// A
// / | \
// B C D
//
// We want to make a rule of the form `B C D A`, i.e. if you match `B C D`
// then you have matched an `A`. This "backward" syntax enables us to prefix
// scan for left corners (`B` in this case) and find all rules starting with
// `B`.

auto child_vector = ast.GetChildren(node_id);
assert(child_vector.size() >= 1);

// FIXME: do something else with long grammar rules. PHP has
// some generated initializer lists with 100s of elements that
// blows up our stack when serializing a grammar.
if (child_vector.size() > 100) {
continue;
}

// Add the child nodes to the work list.
nodes.insert(nodes.end(), child_vector.begin(), child_vector.end());

// Walk the trie
std::uint64_t leaves_id = 0;
for (auto child_id : child_vector) {
auto child = ast.GetNode(child_id);
leaves_id = grammar.GetChild(leaves_id, child.kind);
}
// Save pointer to rule head
auto head_id = grammar.GetChild(leaves_id, node.kind);

// Avoid creating cyclic CFGs
bool allow_production = true;

if (child_vector.size() == 1) {
std::vector<unsigned short> queue = { node.kind };
while (!queue.empty()) {
auto nt = queue.back();
queue.pop_back();

// Check if we can reach our own left corner
auto child = ast.GetNode(child_vector[0]);
if (nt == child.kind) {
allow_production = false;
break;
}

// Queue result of matching trivial productions
for(auto [left, rest] : grammar.GetChildLeaves(0, nt)) {
auto node = grammar.GetNode(rest);
if(node.is_production) {
queue.push_back(left);
}
}
}
}

// Mark the head as a production if appropriate
grammar.UpdateNode(head_id, {allow_production});
}
}
}

void BuildAST(mx::Index index, ServerContext &context) {
for(auto file : mx::File::in(index)) {
for(auto fragment : mx::Fragment::in(file)) {
sqlite::Transaction tx(context.db);
std::scoped_lock<sqlite::Transaction> lock(tx);
SerializeAST(fragment, context);
ImportGrammar(fragment, context);
}
}
}
} // namespace indexer
17 changes: 17 additions & 0 deletions bin/Index/BuildAST.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright (c) 2022-present, Trail of Bits, Inc.
// All rights reserved.
//
// This source code is licensed in accordance with the terms specified in
// the LICENSE file found in the root directory of this source tree.

#pragma once

#include <memory>
#include <multiplier/Index.h>
#include "Context.h"

namespace indexer {

void BuildAST(mx::Index index, ServerContext& context);

} // namespace indexer
3 changes: 3 additions & 0 deletions bin/Index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
set(exe_name "mx-index")

add_executable("${exe_name}"
"BuildAST.cpp"
"BuildAST.h"
"BuildPendingFragment.cpp"
"Compress.cpp"
"Compress.h"
Expand Down Expand Up @@ -57,6 +59,7 @@ target_link_libraries("${exe_name}"
PRIVATE
${MX_BEGIN_FORCE_LOAD_GROUP}
"mx-util"
"mx-api"
"concurrentqueue"
${MX_BEGIN_FORCE_LOAD_LIB} pasta::pasta ${MX_END_FORCE_LOAD_LIB}
${MX_END_FORCE_LOAD_GROUP}
Expand Down
4 changes: 4 additions & 0 deletions bin/Index/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "Context.h"
#include "Parser.h"
#include "Importer.h"
#include "BuildAST.h"

// Should we show a help message?
DECLARE_bool(help);
Expand Down Expand Up @@ -158,5 +159,8 @@ extern "C" int main(int argc, char *argv[]) {
executor.Start();
executor.Wait();

auto index = mx::Index(mx::EntityProvider::from_database(FLAGS_db));
indexer::BuildAST(index, ic->server_context[0]);

return EXIT_SUCCESS;
}
35 changes: 35 additions & 0 deletions bin/Query/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#
# Copyright (c) 2022-present, Trail of Bits, Inc.
# All rights reserved.
#
# This source code is licensed in accordance with the terms specified in
# the LICENSE file found in the root directory of this source tree.
#

add_executable("syntex-query" "SyntexQuery.cpp")

target_link_libraries("syntex-query"
PRIVATE
gflags
glog::glog
"mx-api"
)

install(
TARGETS
"syntex-query"
EXPORT
"${PROJECT_NAME}Targets"
RUNTIME
DESTINATION
"${CMAKE_INSTALL_BINDIR}"
)

add_executable("predicate-example" "PredicateExample.cpp")

target_link_libraries("predicate-example"
PRIVATE
gflags
glog::glog
"mx-api"
)
Loading