Skip to content

Commit

Permalink
Optimizer improvements - implement #4
Browse files Browse the repository at this point in the history
  • Loading branch information
markpapadakis committed May 26, 2017
1 parent fcc71db commit a6ada57
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 42 deletions.
186 changes: 155 additions & 31 deletions exec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,36 +41,6 @@ namespace // static/local this module
exec_node expr;
};

// A phrase: `size` term ids held in the trailing zero-length array `termIDs`.
// NOTE(review): presumably instances are allocated with room for `size` ids
// directly after the header -- confirm against the allocation site.
struct phrase final
{
        // number of valid entries in termIDs[]
        uint8_t size;
        // the phrase's term ids, in phrase order
        exec_term_id_t termIDs[0];

        // Two phrases are equal when they have the same length and hold the
        // same term ids at every position.
        auto operator==(const phrase &o) const noexcept
        {
                if (size != o.size)
                        return false;

                for (uint32_t idx{0}; idx != size; ++idx)
                {
                        if (termIDs[idx] != o.termIDs[idx])
                                return false;
                }

                return true;
        }

        // Linear scan: true if `id` appears anywhere in this phrase.
        bool is_set(const exec_term_id_t id) const noexcept
        {
                uint32_t idx{0};

                while (idx != size && termIDs[idx] != id)
                        ++idx;

                return idx != size;
        }
};

struct termsrun final
{
Expand Down Expand Up @@ -174,6 +144,78 @@ namespace // static/local this module
}
};

// A phrase: `size` term ids held in the trailing zero-length array `termIDs`,
// plus set-style helpers that relate the phrase to a termsrun.
// NOTE(review): presumably instances are allocated with room for `size` ids
// directly after the header -- confirm against the allocation site.
struct phrase final
{
        // number of valid entries in termIDs[]
        uint8_t size;
        // the phrase's term ids, in phrase order
        exec_term_id_t termIDs[0];

        // Writes to `out`, in phrase order, every term id of this phrase for which
        // tr->is_set() is true, and returns how many ids were written.
        // `out` must have room for at least `size` entries.
        uint16_t intersection(const termsrun *const tr, exec_term_id_t *const out) const noexcept
        {
                uint16_t found{0};

                for (uint32_t k{0}; k != size; ++k)
                {
                        const auto id = termIDs[k];

                        if (tr->is_set(id))
                                out[found++] = id;
                }

                return found;
        }

        // Despite its name this is a set difference (run minus phrase): it writes to
        // `out`, in run order, the terms found in the run but missing from this
        // phrase, and returns how many were written.
        // `out` must have room for at least tr->size entries.
        uint16_t disjoint_union(const termsrun *const tr, exec_term_id_t *const out) const noexcept
        {
                const auto total = tr->size;
                uint16_t kept{0};

                for (uint32_t k{0}; k != total; ++k)
                {
                        const auto id = tr->terms[k];

                        if (is_set(id))
                                continue;

                        out[kept++] = id;
                }

                return kept;
        }

        // True when the run is at least as long as the phrase and every term id of
        // this phrase is set in the run.
        bool intersected_by(const termsrun *const tr) const noexcept
        {
                if (tr->size < size)
                        return false;

                for (uint32_t k{0}; k != size; ++k)
                {
                        if (!tr->is_set(termIDs[k]))
                                return false;
                }

                return true;
        }

        // Two phrases are equal when they have the same length and hold the
        // same term ids at every position.
        auto operator==(const phrase &o) const noexcept
        {
                if (size != o.size)
                        return false;

                for (uint32_t k{0}; k != size; ++k)
                {
                        if (termIDs[k] != o.termIDs[k])
                                return false;
                }

                return true;
        }

        // Linear scan: true if `id` appears anywhere in this phrase.
        bool is_set(const exec_term_id_t id) const noexcept
        {
                uint32_t k{0};

                while (k != size && termIDs[k] != id)
                        ++k;

                return k != size;
        }
};

struct cacheable_termsrun
{
docid_t lastConsideredDID;
Expand Down Expand Up @@ -325,7 +367,7 @@ namespace // static/local this module
const auto tctx = idxsrc->term_ctx(term);

if (traceCompile)
SLog(ansifmt::bold, ansifmt::color_green, "[", term, "] ", termsDict.size(), ", documents = ", tctx.documents, ansifmt::reset, "\n");
SLog(ansifmt::bold, ansifmt::color_green, "[", term, "] ", termsDict.size(), ", documents = ", tctx.documents, ansifmt::reset, " (", ptr_repr(this), ")\n");

if (tctx.documents == 0)
{
Expand Down Expand Up @@ -1834,6 +1876,32 @@ static exec_node optimize_node(exec_node n, runtime_ctx &rctx, simple_allocator
set_dirty();
return n;
}
else if (ctx->lhs.fp == matchphrase_impl && ctx->rhs.fp == matchallterms_impl)
{
        // (PHRASE OR ALL OF[run]) may only collapse to ALL OF[run] when every
        // document matched by the phrase is also matched by the run, i.e. when
        // every term of the run is one of the phrase's terms:
        //   ("1 2 3" OR ALL OF[1,2]) => ALL OF[1,2]
        // The opposite containment (phrase terms all present in the run) is NOT
        // enough: ("1 2" OR ALL OF[3,1,2]) must still match a document that
        // contains "1 2" but not 3, so it cannot be rewritten to ALL OF[3,1,2].
        const auto *const __restrict__ run = static_cast<const runtime_ctx::termsrun *>(ctx->rhs.ptr);
        const auto *const phrase = static_cast<const runtime_ctx::phrase *>(ctx->lhs.ptr);
        // never collapse into an empty run
        bool subsumed = run->size != 0;

        for (uint32_t i{0}; subsumed && i != run->size; ++i)
                subsumed = phrase->is_set(run->terms[i]);

        if (subsumed)
        {
                n = ctx->rhs;
                set_dirty();
                return n;
        }
}
else if (ctx->lhs.fp == matchallterms_impl && ctx->rhs.fp == matchphrase_impl)
{
        // Mirror image of the case above:
        //   (ALL OF[1,2] OR "1 2 3") => ALL OF[1,2]
        // Only valid when every term of the run is also a term of the phrase.
        const auto *const __restrict__ run = static_cast<const runtime_ctx::termsrun *>(ctx->lhs.ptr);
        const auto *const phrase = static_cast<const runtime_ctx::phrase *>(ctx->rhs.ptr);
        // never collapse into an empty run
        bool subsumed = run->size != 0;

        for (uint32_t i{0}; subsumed && i != run->size; ++i)
                subsumed = phrase->is_set(run->terms[i]);

        if (subsumed)
        {
                n = ctx->lhs;
                set_dirty();
                return n;
        }
}
}
else if (n.fp == logicaland_impl)
{
Expand Down Expand Up @@ -1959,6 +2027,62 @@ static exec_node optimize_node(exec_node n, runtime_ctx &rctx, simple_allocator
}
}
}

if (ctx->lhs.fp == matchallterms_impl && ctx->rhs.fp == matchphrase_impl)
{
        // ( pc game hitman ) AND "pc game"
        // hitman AND "pc game"
        // Terms of the run that already appear in the phrase are redundant, because
        // the phrase requires them anyway. This is somewhat expensive, but worth it.
        auto run = static_cast<runtime_ctx::termsrun *>(ctx->lhs.ptr);
        const auto phr = static_cast<const runtime_ctx::phrase *>(ctx->rhs.ptr);
        exec_term_id_t terms[128];

        // phr->size bound: arbitrary cap on the cost of the scan.
        // run->size bound: disjoint_union() writes up to run->size ids into
        // `terms`, so the run must fit in the scratch buffer.
        if (phr->size < 128 && run->size <= sizeof(terms) / sizeof(terms[0]))
        {
                const auto cnt = phr->disjoint_union(run, terms);

                if (cnt == 0)
                {
                        // every run term is part of the phrase: the phrase alone suffices
                        n = ctx->rhs;
                        set_dirty();
                        return n;
                }
                else if (cnt < run->size)
                {
                        // keep only the run terms the phrase does not already require
                        run->size = cnt;
                        memcpy(run->terms, terms, sizeof(terms[0]) * cnt);
                }
        }
}

if (ctx->rhs.fp == matchallterms_impl && ctx->lhs.fp == matchphrase_impl)
{
        // "pc game" AND ( pc game hitman )
        // "pc game" AND hitman
        // Mirror image of the case above. This is somewhat expensive, but worth it.
        auto run = static_cast<runtime_ctx::termsrun *>(ctx->rhs.ptr);
        const auto phr = static_cast<const runtime_ctx::phrase *>(ctx->lhs.ptr);
        exec_term_id_t terms[128];

        // phr->size bound: arbitrary cap on the cost of the scan.
        // run->size bound: disjoint_union() writes up to run->size ids into
        // `terms`, so the run must fit in the scratch buffer.
        if (phr->size < 128 && run->size <= sizeof(terms) / sizeof(terms[0]))
        {
                const auto cnt = phr->disjoint_union(run, terms);

                if (cnt == 0)
                {
                        // every run term is part of the phrase: keep the phrase, which
                        // is the LEFT operand here (returning the run would drop the
                        // phrase constraint)
                        n = ctx->lhs;
                        set_dirty();
                        return n;
                }
                else if (cnt < run->size)
                {
                        // keep only the run terms the phrase does not already require
                        run->size = cnt;
                        memcpy(run->terms, terms, sizeof(terms[0]) * cnt);
                }
        }
}
}
else if (n.fp == logicalnot_impl)
{
Expand Down
11 changes: 11 additions & 0 deletions index_source.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,30 @@ namespace Trinity

// Returns the term_index_ctx for `term`, consulting `cache` while holding
// `cacheLock`. When the cache reports a fresh entry, the entry is filled in
// from resolve_term_ctx(term) before being returned.
// NOTE(review): presumably cache.Add() / cache.insert() report "true" only when
// the term was not cached yet -- confirm against the cache implementation.
term_index_ctx term_ctx(const str8_t term)
{
        [[maybe_unused]] static constexpr bool trace{false};
        std::lock_guard<std::mutex> guard(cacheLock);

#ifndef LEAN_SWITCH
        term_index_ctx *slot;

        if (!cache.Add(term, {}, &slot))
        {
                // nothing to resolve; hand back what the cache holds
                return *slot;
        }

        *slot = resolve_term_ctx(term);

        if (trace)
                SLog("RESOLVED [", term, "] ", slot->documents, " ", ptr_repr(this), "\n");

        return *slot;
#else
        auto inserted = cache.insert({term, {}});
        auto &entry = inserted.first->second;

        if (inserted.second)
        {
                entry = resolve_term_ctx(term);

                if (trace)
                        SLog("RESOLVED [", term, "] ", entry.documents, " ", ptr_repr(this), "\n");
        }

        return entry;
#endif
}
Expand Down
24 changes: 14 additions & 10 deletions queries.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,14 @@ static std::pair<Operator, uint8_t> parse_operator_impl(ast_parser &ctx)
switch (f)
{
case '+':
return {Operator::STRICT_AND, 1};
if (char_t c; s.size() > 1 && (c = s.p[1]) && !isspace(c) && c != '+')
{
// TODO: this should have been a bit more sophisticated
// e.g it shouldn't return an operator for +< or other non alphanumeric characters
return {Operator::STRICT_AND, 1};
}
break;

case '-':
return {Operator::NOT, 1};
}
Expand All @@ -183,15 +190,6 @@ static std::pair<Operator, uint8_t> parse_operator_impl(ast_parser &ctx)
return {Operator::NONE, 0};
else
return {Operator::AND, 0};

#if 0
if (isalnum(f) || f == '\"' || f == '(')
return {Operator::AND, 0};
else
return {Operator::NONE, 0};
#else
return {Operator::AND, 0};
#endif
}
else
return {Operator::NONE, 0};
Expand Down Expand Up @@ -1555,6 +1553,12 @@ std::pair<uint32_t, uint8_t> Trinity::default_token_parser_impl(const Trinity::s
const auto *p = content.begin(), *const e = content.end(), *const b{p};
bool allAlphas{true};

if (*p == '+' && (p + 1 == e || (p[1] != '+' && p[1] != '-' && !isalnum(p[1]))))
{
str32_t(_S("PLUS")).CopyTo(out);
return {1, "PLUS"_len};
}

if (p + 4 < e && isalpha(*p) && p[1] == '.' && isalnum(p[2]) && p[3] == '.' && isalpha(p[4]))
{
// Acronyms with punctuations
Expand Down
13 changes: 12 additions & 1 deletion queries.h
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,11 @@ namespace Trinity
// the source query to go away
static void bind_tokens_to_allocator(ast_node *, simple_allocator *);

query() = default;
// Default-constructs a query with `root` and `tokensParser` both explicitly
// set to nullptr, instead of relying on the compiler-generated default.
query()
    : root{nullptr},
      tokensParser{nullptr}
{
}

inline operator bool() const noexcept
{
Expand Down Expand Up @@ -547,6 +551,13 @@ namespace Trinity
cb(run);
}
}


// Returns the query to an empty state: clears `root` and calls
// allocator.reuse().
// NOTE(review): presumably reuse() recycles the allocator's memory, so any
// nodes obtained from this query before reset() must not be used -- confirm.
void reset()
{
// drop the root before the allocator is recycled
root = nullptr;
allocator.reuse();
}
};
}

Expand Down
5 changes: 5 additions & 0 deletions queries_rewrite.h
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,11 @@ namespace Trinity
void rewrite_query(Trinity::query &q, size_t budget, const uint8_t K, L &&l)
{
static constexpr bool trace{false};

if (!q)
return;


const auto before = Timings::Microseconds::Tick();
auto &allocator = q.allocator;
static thread_local gen_ctx genCtx;
Expand Down

0 comments on commit a6ada57

Please sign in to comment.