From 8a68213c952fd539eb40c9b357e19bb82a946e17 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Sat, 3 Aug 2024 17:31:40 +0200 Subject: [PATCH 1/2] Fix a bug in the `VariableToColumnMap` of `TransitivePath` (#1432) There was a subtle and long-standing bug in the computation of the result width of a transitive path operation. This is now fixed. This fixes queries like https://qlever.cs.uni-freiburg.de/wikidata/zCHu9V (which without this fix has its third column wrong) and https://qlever.cs.uni-freiburg.de/wikidata/S1XA29 (which without this fix runs into an assertion failure). --- src/engine/TransitivePathBase.cpp | 4 +- test/TransitivePathTest.cpp | 77 +++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 25 deletions(-) diff --git a/src/engine/TransitivePathBase.cpp b/src/engine/TransitivePathBase.cpp index b0e11eec0e..85eaa3236c 100644 --- a/src/engine/TransitivePathBase.cpp +++ b/src/engine/TransitivePathBase.cpp @@ -378,8 +378,8 @@ std::shared_ptr TransitivePathBase::bindLeftOrRightSide( AD_CORRECTNESS_CHECK(!p->variableColumns_.contains(variable)); p->variableColumns_[variable] = columnIndexWithType; - p->resultWidth_++; } + p->resultWidth_ += leftOrRightOp->getResultWidth() - 1; return std::move(p); } @@ -397,6 +397,8 @@ void TransitivePathBase::copyColumns(const IdTableView& inputTable, size_t skipCol) const { size_t inCol = 0; size_t outCol = 2; + AD_CORRECTNESS_CHECK(skipCol < inputTable.numColumns()); + AD_CORRECTNESS_CHECK(inputTable.numColumns() + 1 == outputTable.numColumns()); while (inCol < inputTable.numColumns() && outCol < outputTable.numColumns()) { if (skipCol == inCol) { inCol++; diff --git a/test/TransitivePathTest.cpp b/test/TransitivePathTest.cpp index 651faa9442..e616ad2e2b 100644 --- a/test/TransitivePathTest.cpp +++ b/test/TransitivePathTest.cpp @@ -247,14 +247,27 @@ TEST_P(TransitivePathTest, idToLeftBound) { TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); TransitivePathSide right(std::nullopt, 1, V(4), 
1); - auto T = makePathLeftBound( - std::move(sub), {Variable{"?start"}, Variable{"?target"}}, - std::move(leftOpTable), 1, {Variable{"?x"}, Variable{"?start"}}, - std::move(left), std::move(right), 0, std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + { + auto T = makePathLeftBound( + sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + leftOpTable.clone(), 1, {Variable{"?x"}, Variable{"?start"}}, left, + right, 0, std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(); + ASSERT_THAT(resultTable.idTable(), + ::testing::UnorderedElementsAreArray(expected)); + } + { + auto T = makePathLeftBound( + std::move(sub), {Variable{"?start"}, Variable{"?target"}}, + std::move(leftOpTable), 1, {std::nullopt, Variable{"?start"}}, + std::move(left), std::move(right), 0, + std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(); + ASSERT_THAT(resultTable.idTable(), + ::testing::UnorderedElementsAreArray(expected)); + } } TEST_P(TransitivePathTest, idToRightBound) { @@ -280,14 +293,27 @@ TEST_P(TransitivePathTest, idToRightBound) { TransitivePathSide left(std::nullopt, 0, V(0), 0); TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); - auto T = makePathRightBound( - std::move(sub), {Variable{"?start"}, Variable{"?target"}}, - std::move(rightOpTable), 0, {Variable{"?target"}, Variable{"?x"}}, - std::move(left), std::move(right), 0, std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + { + auto T = makePathRightBound( + sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + rightOpTable.clone(), 0, {Variable{"?target"}, Variable{"?x"}}, left, + right, 0, std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(); + 
ASSERT_THAT(resultTable.idTable(), + ::testing::UnorderedElementsAreArray(expected)); + } + { + auto T = makePathRightBound( + std::move(sub), {Variable{"?start"}, Variable{"?target"}}, + std::move(rightOpTable), 0, {Variable{"?target"}, std::nullopt}, + std::move(left), std::move(right), 0, + std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(); + ASSERT_THAT(resultTable.idTable(), + ::testing::UnorderedElementsAreArray(expected)); + } } TEST_P(TransitivePathTest, leftBoundToVar) { @@ -318,14 +344,17 @@ TEST_P(TransitivePathTest, leftBoundToVar) { TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); - auto T = makePathLeftBound( - std::move(sub), {Variable{"?start"}, Variable{"?target"}}, - std::move(leftOpTable), 1, {Variable{"?x"}, Variable{"?start"}}, - std::move(left), std::move(right), 0, std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + { + auto T = makePathLeftBound( + std::move(sub), {Variable{"?start"}, Variable{"?target"}}, + std::move(leftOpTable), 1, {Variable{"?x"}, Variable{"?start"}}, + std::move(left), std::move(right), 0, + std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(); + ASSERT_THAT(resultTable.idTable(), + ::testing::UnorderedElementsAreArray(expected)); + } } TEST_P(TransitivePathTest, rightBoundToVar) { From 0b9d26f55f2c6632ddaef756b124acea8140d904 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Sat, 3 Aug 2024 19:05:32 +0200 Subject: [PATCH 2/2] Support triple patterns with zero variables (#1395) So far, triple patterns in a query had to contain at least one variable. But the SPARQL 1.1 standard also supports triple patterns with no variables, like in SELECT * WHERE { wd:Q42 wdt:P31 wd:Q5 ... }. 
The semantics is that if the triple exists in the dataset, the triple pattern acts as the neutral element (that is, as if it weren't there), and if it does not exist in the dataset, it acts as the zero element (that is, the result of the whole graph pattern will be empty). This is now implemented; here is an example query: https://qlever.cs.uni-freiburg.de/wikidata/TkjahB . In particular, this fixes #835. As part of this fix, the code is refactored and simplified significantly, in particular: The ScanSpecification class, which so far was used only in the index classes, now has a sibling ScanSpecificationAsTripleComponent, which is now also used in the class for the IndexScan operation. Remove significant amounts of redundant code from the time when index scans with one variable were executed (and cached) at query planning time. Remove the associated special case (used in testing) of an index scan without a query execution context. --- src/engine/CartesianProductJoin.cpp | 15 ++- src/engine/CartesianProductJoin.h | 6 +- src/engine/IndexScan.cpp | 142 ++++++++--------------- src/engine/IndexScan.h | 1 + src/engine/Operation.cpp | 21 +--- src/engine/Operation.h | 2 +- src/engine/QueryExecutionContext.h | 31 +---- src/engine/QueryPlanner.cpp | 9 +- src/engine/Server.cpp | 1 - src/engine/idTable/IdTable.h | 4 +- src/index/CMakeLists.txt | 2 +- src/index/CompressedRelation.h | 43 +------ src/index/Index.cpp | 19 ++- src/index/Index.h | 19 ++- src/index/IndexImpl.cpp | 35 +++--- src/index/IndexImpl.h | 20 ++-- src/index/Permutation.h | 2 - src/index/ScanSpecification.cpp | 89 ++++++++++++++ src/index/ScanSpecification.h | 81 +++++++++++++ src/parser/CMakeLists.txt | 2 +- src/util/ConcurrentCache.h | 2 +- test/CompressedRelationsTest.cpp | 7 +- test/IndexTest.cpp | 9 +- test/QueryPlannerTest.cpp | 74 ++++++------ test/QueryPlannerTestHelpers.h | 3 + test/TriplesViewTest.cpp | 2 +- test/ValuesForTestingTest.cpp | 16 ++- test/engine/CartesianProductJoinTest.cpp | 16 ++- 
test/engine/IndexScanTest.cpp | 64 ++++++++++ test/engine/ValuesForTesting.h | 3 +- test/index/CMakeLists.txt | 1 + test/index/ScanSpecificationTest.cpp | 81 +++++++++++++ test/util/IndexTestHelpers.cpp | 7 +- 33 files changed, 521 insertions(+), 308 deletions(-) create mode 100644 src/index/ScanSpecification.cpp create mode 100644 src/index/ScanSpecification.h create mode 100644 test/index/ScanSpecificationTest.cpp diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index 8f8147f5c2..c73361e001 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -154,10 +154,19 @@ ProtoResult CartesianProductJoin::computeResult( child.setLimit(limitIfPresent.value()); } subResults.push_back(child.getResult()); + + const auto& table = subResults.back()->idTable(); // Early stopping: If one of the results is empty, we can stop early. - if (subResults.back()->idTable().size() == 0) { + if (table.empty()) { break; } + + // If one of the children is the neutral element (because of a triple with + // zero variables), we can simply ignore it here. + if (table.numRows() == 1 && table.numColumns() == 0) { + subResults.pop_back(); + continue; + } // Example for the following calculation: If we have a LIMIT of 1000 and // the first child already has a result of size 100, then the second child // needs to evaluate only its first 10 results. The +1 is because integer @@ -169,6 +178,10 @@ ProtoResult CartesianProductJoin::computeResult( } } + // TODO Find a solution to cheaply handle the case, that only a + // single result is left. This can probably be done by using the + // `ProtoResult`. 
+ auto sizesView = std::views::transform( subResults, [](const auto& child) { return child->idTable().size(); }); auto totalResultSize = std::accumulate(sizesView.begin(), sizesView.end(), diff --git a/src/engine/CartesianProductJoin.h b/src/engine/CartesianProductJoin.h index 96047d62fa..779adf1dba 100644 --- a/src/engine/CartesianProductJoin.h +++ b/src/engine/CartesianProductJoin.h @@ -2,9 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#ifndef QLEVER_CARTESIANPRODUCTJOIN_H -#define QLEVER_CARTESIANPRODUCTJOIN_H - +#pragma once #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" @@ -92,5 +90,3 @@ class CartesianProductJoin : public Operation { std::span inputColumn, size_t groupSize, size_t offset); }; - -#endif // QLEVER_CARTESIANPRODUCTJOIN_H diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 88f26b7149..15940b525b 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -6,6 +6,7 @@ #include +#include #include #include @@ -26,17 +27,19 @@ IndexScan::IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation, numVariables_(static_cast(subject_.isVariable()) + static_cast(predicate_.isVariable()) + static_cast(object_.isVariable())) { + // We previously had `nullptr`s here in unit tests. This is no longer + // necessary nor allowed. + AD_CONTRACT_CHECK(qec != nullptr); for (auto& [idx, variable] : triple.additionalScanColumns_) { additionalColumns_.push_back(idx); additionalVariables_.push_back(variable); } sizeEstimate_ = computeSizeEstimate(); - // Check the following invariant: The permuted input triple must contain at - // least one variable, and all the variables must be at the end of the + // Check the following invariant: All the variables must be at the end of the // permuted triple. For example in the PSO permutation, either only the O, or - // the S and O, or all three of P, S, O can be variables, all other - // combinations are not supported. 
+ // the S and O, or all three of P, S, O, or none of them can be variables, all + // other combinations are not supported. auto permutedTriple = getPermutedTriple(); for (size_t i = 0; i < 3 - numVariables_; ++i) { AD_CONTRACT_CHECK(!permutedTriple.at(i)->isVariable()); @@ -57,7 +60,7 @@ string IndexScan::getCacheKeyImpl() const { auto permutationString = Permutation::toString(permutation_); if (numVariables_ == 3) { - os << "SCAN FOR FULL INDEX " << permutationString << " (DUMMY OPERATION)"; + os << "SCAN FOR FULL INDEX " << permutationString; } else { os << "SCAN " << permutationString << " with "; @@ -66,10 +69,9 @@ string IndexScan::getCacheKeyImpl() const { const auto& key = getPermutedTriple().at(idx)->toRdfLiteral(); os << keyString << " = \"" << key << "\""; }; - addKey(0); - if (numVariables_ == 1) { + for (size_t i = 0; i < 3 - numVariables_; ++i) { + addKey(i); os << ", "; - addKey(1); } } if (!additionalColumns_.empty()) { @@ -92,16 +94,8 @@ size_t IndexScan::getResultWidth() const { // _____________________________________________________________________________ vector IndexScan::resultSortedOn() const { - switch (numVariables_) { - case 1: - return {ColumnIndex{0}}; - case 2: - return {ColumnIndex{0}, ColumnIndex{1}}; - case 3: - return {ColumnIndex{0}, ColumnIndex{1}, ColumnIndex{2}}; - default: - AD_FAIL(); - } + auto resAsView = ad_utility::integerRange(ColumnIndex{numVariables_}); + return std::vector{resAsView.begin(), resAsView.end()}; } // _____________________________________________________________________________ @@ -130,12 +124,8 @@ ProtoResult IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { using enum Permutation::Enum; idTable.setNumColumns(numVariables_); const auto& index = _executionContext->getIndex(); - const auto permutedTriple = getPermutedTriple(); - if (numVariables_ == 2) { - idTable = index.scan(*permutedTriple[0], std::nullopt, permutation_, - additionalColumns(), cancellationHandle_, getLimit()); - } else 
if (numVariables_ == 1) { - idTable = index.scan(*permutedTriple[0], *permutedTriple[1], permutation_, + if (numVariables_ < 3) { + idTable = index.scan(getScanSpecification(), permutation_, additionalColumns(), cancellationHandle_, getLimit()); } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); @@ -150,44 +140,19 @@ ProtoResult IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { // _____________________________________________________________________________ size_t IndexScan::computeSizeEstimate() const { - if (_executionContext) { - // Should always be in this branch. Else is only for test cases. - - // We have to do a simple scan anyway so might as well do it now - if (numVariables_ == 1) { - // TODO Use the monadic operation `std::optional::or_else`. - // Note: we cannot use `optional::value_or()` here, because the else - // case is expensive to compute, and we need it lazily evaluated. - if (auto size = getExecutionContext()->getQueryTreeCache().getPinnedSize( - getCacheKey()); - size.has_value()) { - return size.value(); - } else { - // This call explicitly has to read two blocks of triples from memory to - // obtain an exact size estimate. - return getIndex().getResultSizeOfScan( - *getPermutedTriple()[0], *getPermutedTriple().at(1), permutation_); - } - } else if (numVariables_ == 2) { - const TripleComponent& firstKey = *getPermutedTriple().at(0); - return getIndex().getCardinality(firstKey, permutation_); - } else { - // The triple consists of three variables. - // TODO As soon as all implementations of a full index scan - // (Including the "dummy joins" in Join.cpp) consistently exclude the - // internal triples, this estimate should be changed to only return - // the number of triples in the actual knowledge graph (excluding the - // internal triples). 
- AD_CORRECTNESS_CHECK(numVariables_ == 3); - return getIndex().numTriples().normalAndInternal_(); - } + AD_CORRECTNESS_CHECK(_executionContext); + // We have to do a simple scan anyway so might as well do it now + if (numVariables_ < 3) { + return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_); } else { - // Only for test cases. The handling of the objects is to make the - // strange query planner tests pass. - auto strLen = [](const auto& el) { - return (el.isString() ? el.getString() : el.toString()).size(); - }; - return 1000 + strLen(subject_) + strLen(object_) + strLen(predicate_); + // The triple consists of three variables. + // TODO As soon as all implementations of a full index scan + // (Including the "dummy joins" in Join.cpp) consistently exclude the + // internal triples, this estimate should be changed to only return + // the number of triples in the actual knowledge graph (excluding the + // internal triples). + AD_CORRECTNESS_CHECK(numVariables_ == 3); + return getIndex().numTriples().normalAndInternal_(); } } @@ -200,29 +165,20 @@ size_t IndexScan::getCostEstimate() { // _____________________________________________________________________________ void IndexScan::determineMultiplicities() { - multiplicity_.clear(); - if (_executionContext) { + multiplicity_ = [this]() -> std::vector { const auto& idx = getIndex(); - if (numVariables_ == 1) { + if (numVariables_ == 0) { + return {}; + } else if (numVariables_ == 1) { // There are no duplicate triples in RDF and two elements are fixed. 
- multiplicity_.emplace_back(1); + return {1.0f}; } else if (numVariables_ == 2) { - const auto permutedTriple = getPermutedTriple(); - multiplicity_ = idx.getMultiplicities(*permutedTriple[0], permutation_); + return idx.getMultiplicities(*getPermutedTriple()[0], permutation_); } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); - multiplicity_ = idx.getMultiplicities(permutation_); - } - } else { - // This branch is only used in certain unit tests. - multiplicity_.emplace_back(1); - if (numVariables_ == 2) { - multiplicity_.emplace_back(1); - } - if (numVariables_ == 3) { - multiplicity_.emplace_back(1); + return idx.getMultiplicities(permutation_); } - } + }(); for ([[maybe_unused]] size_t i : std::views::iota(multiplicity_.size(), getResultWidth())) { multiplicity_.emplace_back(1); @@ -277,6 +233,12 @@ std::array IndexScan::getPermutedTriple() triple[permutation[2]]}; } +// ___________________________________________________________________________ +ScanSpecificationAsTripleComponent IndexScan::getScanSpecification() const { + auto permutedTriple = getPermutedTriple(); + return {*permutedTriple[0], *permutedTriple[1], *permutedTriple[2]}; +} + // ___________________________________________________________________________ Permutation::IdTableGenerator IndexScan::getLazyScan( const IndexScan& s, std::vector blocks) { @@ -290,6 +252,10 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( col1Id = s.getPermutedTriple()[1]->toValueId(index.getVocab()).value(); } + // This function is currently only called by the `getLazyScanForJoin...` + // functions. In these cases we always have at least one variable in each of + // the scans, because otherwise there would be no join column. 
+ AD_CORRECTNESS_CHECK(s.numVariables_ >= 1); // If there is a LIMIT or OFFSET clause that constrains the scan // (which can happen with an explicit subquery), we cannot use the prefiltered // blocks, as we currently have no mechanism to include limits and offsets @@ -306,28 +272,20 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( // ________________________________________________________________ std::optional IndexScan::getMetadataForScan( const IndexScan& s) { - auto permutedTriple = s.getPermutedTriple(); - const IndexImpl& index = s.getIndex().getImpl(); - auto numVars = s.numVariables_; - std::optional col0Id = - numVars == 3 ? std::nullopt - : permutedTriple[0]->toValueId(index.getVocab()); - std::optional col1Id = - numVars >= 2 ? std::nullopt - : permutedTriple[1]->toValueId(index.getVocab()); - if ((!col0Id.has_value() && numVars < 3) || - (!col1Id.has_value() && numVars < 2)) { + const auto& index = s.getExecutionContext()->getIndex().getImpl(); + auto scanSpec = s.getScanSpecification().toScanSpecification(index); + if (!scanSpec.has_value()) { return std::nullopt; } - return index.getPermutation(s.permutation()) - .getMetadataAndBlocks({col0Id, col1Id, std::nullopt}); + .getMetadataAndBlocks(scanSpec.value()); }; // ________________________________________________________________ std::array IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) { AD_CONTRACT_CHECK(s1.numVariables_ <= 3 && s2.numVariables_ <= 3); + AD_CONTRACT_CHECK(s1.numVariables_ >= 1 && s2.numVariables_ >= 1); // This function only works for single column joins. 
This means that the first // variable of both scans must be equal, but all other variables of the scans @@ -376,7 +334,7 @@ IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) { Permutation::IdTableGenerator IndexScan::lazyScanForJoinOfColumnWithScan( std::span joinColumn, const IndexScan& s) { AD_EXPENSIVE_CHECK(std::ranges::is_sorted(joinColumn)); - AD_CORRECTNESS_CHECK(s.numVariables_ <= 3); + AD_CORRECTNESS_CHECK(s.numVariables_ <= 3 && s.numVariables_ > 0); auto metaBlocks1 = getMetadataForScan(s); diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index 61daa42f06..5914e27d71 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -100,6 +100,7 @@ class IndexScan final : public Operation { // `permutation_`. For example if `permutation_ == PSO` then the result is // {&predicate_, &subject_, &object_} std::array getPermutedTriple() const; + ScanSpecificationAsTripleComponent getScanSpecification() const; private: ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index cd38e3d504..4293aa2c9c 100644 --- a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -88,25 +88,6 @@ std::shared_ptr Operation::getResult( const bool pinResult = _executionContext->_pinSubtrees || pinFinalResultButNotSubtrees; - // When we pin the final result but no subtrees, we need to remember the sizes - // of all involved index scans that have only one free variable. Note that - // these index scans are executed already during query planning because they - // have to be executed anyway, for any query plan. If we don't remember these - // sizes here, future queries that take the result from the cache would redo - // these index scans. Note that we do not need to remember the multiplicity - // (and distinctness) because the multiplicity for an index scan with a single - // free variable is always 1. 
- if (pinFinalResultButNotSubtrees) { - auto lock = - getExecutionContext()->getQueryTreeCache().pinnedSizes().wlock(); - forAllDescendants([&lock](QueryExecutionTree* child) { - if (child->getRootOperation()->isIndexScanWithNumVariables(1)) { - (*lock)[child->getRootOperation()->getCacheKey()] = - child->getSizeEstimate(); - } - }); - } - try { // In case of an exception, create the correct runtime info, no matter which // exception handler is called. @@ -270,7 +251,7 @@ void Operation::updateRuntimeInformationOnSuccess( // ____________________________________________________________________________________________________________________ void Operation::updateRuntimeInformationOnSuccess( - const ConcurrentLruCache ::ResultAndCacheStatus& resultAndCacheStatus, + const QueryResultCache::ResultAndCacheStatus& resultAndCacheStatus, Milliseconds duration) { updateRuntimeInformationOnSuccess( *resultAndCacheStatus._resultPointer->resultTable(), diff --git a/src/engine/Operation.h b/src/engine/Operation.h index 920b09594f..9e23ee3fcf 100644 --- a/src/engine/Operation.h +++ b/src/engine/Operation.h @@ -263,7 +263,7 @@ class Operation { // Create and store the complete runtime information for this operation after // it has either been successfully computed or read from the cache. virtual void updateRuntimeInformationOnSuccess( - const ConcurrentLruCache::ResultAndCacheStatus& resultAndCacheStatus, + const QueryResultCache::ResultAndCacheStatus& resultAndCacheStatus, Milliseconds duration) final; // Similar to the function above, but the components are specified manually. diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index ba2bc24487..e7c1ff37f6 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -53,37 +53,8 @@ class CacheValue { // Threadsafe LRU cache for (partial) query results, that // checks on insertion, if the result is currently being computed // by another query. 
-using ConcurrentLruCache = ad_utility::ConcurrentCache< +using QueryResultCache = ad_utility::ConcurrentCache< ad_utility::LRUCache>; -using PinnedSizes = - ad_utility::Synchronized, - std::shared_mutex>; -class QueryResultCache : public ConcurrentLruCache { - private: - PinnedSizes _pinnedSizes; - - public: - virtual ~QueryResultCache() = default; - void clearAll() override { - // The _pinnedSizes are not part of the (otherwise threadsafe) _cache - // and thus have to be manually locked. - auto lock = _pinnedSizes.wlock(); - ConcurrentLruCache::clearAll(); - lock->clear(); - } - // Inherit the constructor. - using ConcurrentLruCache::ConcurrentLruCache; - const PinnedSizes& pinnedSizes() const { return _pinnedSizes; } - PinnedSizes& pinnedSizes() { return _pinnedSizes; } - std::optional getPinnedSize(const std::string& key) { - auto rlock = _pinnedSizes.rlock(); - if (rlock->contains(key)) { - return rlock->at(key); - } else { - return std::nullopt; - } - } -}; // Execution context for queries. // Holds references to index and engine, implements caching. diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 745b27eddc..26f6077270 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -629,7 +629,10 @@ void QueryPlanner::seedFromOrdinaryTriple( const size_t numVars = static_cast(isVariable(triple.s_)) + static_cast(isVariable(triple.p_)) + static_cast(isVariable(triple.o_)); - if (numVars == 1) { + if (numVars == 0) { + // We could read this from any of the permutations. + addIndexScan(Permutation::Enum::PSO); + } else if (numVars == 1) { indexScanSingleVarCase(triple, addIndexScan); } else if (numVars == 2) { indexScanTwoVarsCase(triple, addIndexScan, addFilter); @@ -673,10 +676,6 @@ auto QueryPlanner::seedWithScansAndText( seeds.push_back(getTextLeafPlan(node, textLimits)); continue; } - if (node._variables.empty()) { - AD_THROW("Triples should have at least one variable. 
Not the case in: " + - node.triple_.asString()); - } // Property paths must have been handled previously. AD_CORRECTNESS_CHECK(node.triple_.p_._operation == diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 689d58c7c5..245f3cb94a 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -469,7 +469,6 @@ nlohmann::json Server::composeCacheStatsJson() const { // converter. result["non-pinned-size"] = cache_.nonPinnedSize().getBytes(); result["pinned-size"] = cache_.pinnedSize().getBytes(); - result["num-pinned-index-scan-sizes"] = cache_.pinnedSizes().rlock()->size(); return result; } diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index 4377b02445..dd9895810c 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -1,5 +1,5 @@ -// Copyright 2021, University of Freiburg, Chair of Algorithms and Data -// Structures. Author: Johannes Kalmbach +// Copyright 2021, University of Freiburg, Chair of Algorithms and Data +// Structures. 
Author: Johannes Kalmbach #pragma once diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index c8020551f8..e41fd2471f 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -5,5 +5,5 @@ add_library(index LocatedTriples.cpp Permutation.cpp TextMetaData.cpp DocsDB.cpp FTSAlgorithms.cpp PrefixHeuristic.cpp CompressedRelation.cpp - PatternCreator.cpp) + PatternCreator.cpp ScanSpecification.cpp) qlever_target_link_libraries(index util parser vocabulary compilationInfo ${STXXL_LIBRARIES}) diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 6a6b761699..af8725ec9a 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -11,6 +11,7 @@ #include "engine/idTable/IdTable.h" #include "global/Id.h" +#include "index/ScanSpecification.h" #include "parser/data/LimitOffsetClause.h" #include "util/Cache.h" #include "util/CancellationHandle.h" @@ -350,48 +351,6 @@ class CompressedRelationReader { using ColumnIndices = std::vector; using CancellationHandle = ad_utility::SharedCancellationHandle; - // The specification of a scan operation for a given permutation. - // Can either be a full scan (all three elements are `std::nullopt`), - // a scan for a fixed `col0Id`, a scan for a fixed `col0Id` and `col1Id`, - // or even a scan for a single triple to check whether it is contained in - // the knowledge graph at all. The values which are `nullopt` become variables - // and are returned as columns in the result of the scan. 
- class ScanSpecification { - private: - using T = std::optional; - - T col0Id_; - T col1Id_; - T col2Id_; - - void validate() const { - bool c0 = col0Id_.has_value(); - bool c1 = col1Id_.has_value(); - bool c2 = col2Id_.has_value(); - if (!c0) { - AD_CORRECTNESS_CHECK(!c1 && !c2); - } - if (!c1) { - AD_CORRECTNESS_CHECK(!c2); - } - } - - public: - ScanSpecification(T col0Id, T col1Id, T col2Id) - : col0Id_{col0Id}, col1Id_{col1Id}, col2Id_{col2Id} { - validate(); - } - const T& col0Id() const { return col0Id_; } - const T& col1Id() const { return col1Id_; } - const T& col2Id() const { return col2Id_; } - - // Only used in tests. - void setCol1Id(T col1Id) { - col1Id_ = col1Id; - validate(); - } - }; - // The metadata of a single relation together with a subset of its // blocks and possibly a `col1Id` for additional filtering. This is used as // the input to several functions below that take such an input. diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 468a17a3d7..b23b6629a2 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -276,28 +276,27 @@ vector Index::getMultiplicities(const TripleComponent& key, // ____________________________________________________________________________ IdTable Index::scan( - const TripleComponent& col0String, - std::optional> col1String, + const ScanSpecificationAsTripleComponent& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - return pimpl_->scan(col0String, col1String, p, additionalColumns, + return pimpl_->scan(scanSpecification, p, additionalColumns, cancellationHandle, limitOffset); } // ____________________________________________________________________________ IdTable Index::scan( - Id col0Id, std::optional col1Id, Permutation::Enum p, + const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, 
const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - return pimpl_->scan(col0Id, col1Id, p, additionalColumns, cancellationHandle, - limitOffset); + return pimpl_->scan(scanSpecification, p, additionalColumns, + cancellationHandle, limitOffset); } // ____________________________________________________________________________ -size_t Index::getResultSizeOfScan(const TripleComponent& col0String, - const TripleComponent& col1String, - const Permutation::Enum& permutation) const { - return pimpl_->getResultSizeOfScan(col0String, col1String, permutation); +size_t Index::getResultSizeOfScan( + const ScanSpecificationAsTripleComponent& scanSpecification, + const Permutation::Enum& permutation) const { + return pimpl_->getResultSizeOfScan(scanSpecification, permutation); } diff --git a/src/index/Index.h b/src/index/Index.h index 6ab952e8f5..82534a612f 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -236,24 +236,23 @@ class Index { * @param p The Permutation::Enum to use (in particularly POS(), SOP,... * members of Index class). */ - IdTable scan( - const TripleComponent& col0String, - std::optional> col1String, - Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, - const ad_utility::SharedCancellationHandle& cancellationHandle, - const LimitOffsetClause& limitOffset = {}) const; + IdTable scan(const ScanSpecificationAsTripleComponent& scanSpecification, + Permutation::Enum p, + Permutation::ColumnIndicesRef additionalColumns, + const ad_utility::SharedCancellationHandle& cancellationHandle, + const LimitOffsetClause& limitOffset = {}) const; // Similar to the overload of `scan` above, but the keys are specified as IDs. 
- IdTable scan(Id col0Id, std::optional col1Id, Permutation::Enum p, + IdTable scan(const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset = {}) const; // Similar to the previous overload of `scan`, but only get the exact size of // the scan result. - size_t getResultSizeOfScan(const TripleComponent& col0String, - const TripleComponent& col1String, - const Permutation::Enum& permutation) const; + size_t getResultSizeOfScan( + const ScanSpecificationAsTripleComponent& scanSpecification, + const Permutation::Enum& permutation) const; // Get access to the implementation. This should be used rarely as it // requires including the rather expensive `IndexImpl.h` header diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 30622cbcf6..c466e75481 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1358,46 +1358,41 @@ vector IndexImpl::getMultiplicities( // _____________________________________________________________________________ IdTable IndexImpl::scan( - const TripleComponent& col0String, - std::optional> col1String, + const ScanSpecificationAsTripleComponent& scanSpecificationAsTc, const Permutation::Enum& permutation, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - std::optional col0Id = col0String.toValueId(getVocab()); - std::optional col1Id = - col1String.has_value() ? col1String.value().get().toValueId(getVocab()) - : std::nullopt; - if (!col0Id.has_value() || (col1String.has_value() && !col1Id.has_value())) { - size_t numColumns = col1String.has_value() ? 
1 : 2; + auto scanSpecification = scanSpecificationAsTc.toScanSpecification(*this); + if (!scanSpecification.has_value()) { cancellationHandle->throwIfCancelled(); - return IdTable{numColumns + additionalColumns.size(), allocator_}; + return IdTable{ + scanSpecificationAsTc.numColumns() + additionalColumns.size(), + allocator_}; } - return scan(col0Id.value(), col1Id, permutation, additionalColumns, + return scan(scanSpecification.value(), permutation, additionalColumns, cancellationHandle, limitOffset); } // _____________________________________________________________________________ IdTable IndexImpl::scan( - Id col0Id, std::optional col1Id, Permutation::Enum p, + const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - return getPermutation(p).scan({col0Id, col1Id, std::nullopt}, - additionalColumns, cancellationHandle, - limitOffset); + return getPermutation(p).scan(scanSpecification, additionalColumns, + cancellationHandle, limitOffset); } // _____________________________________________________________________________ size_t IndexImpl::getResultSizeOfScan( - const TripleComponent& col0, const TripleComponent& col1, + const ScanSpecificationAsTripleComponent& scanSpecificationAsTc, const Permutation::Enum& permutation) const { - std::optional col0Id = col0.toValueId(getVocab()); - std::optional col1Id = col1.toValueId(getVocab()); - if (!col0Id.has_value() || !col1Id.has_value()) { + const Permutation& p = getPermutation(permutation); + auto scanSpecification = scanSpecificationAsTc.toScanSpecification(*this); + if (!scanSpecification.has_value()) { return 0; } - const Permutation& p = getPermutation(permutation); - return p.getResultSizeOfScan({col0Id.value(), col1Id.value(), std::nullopt}); + return p.getResultSizeOfScan(scanSpecification.value()); } // 
_____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index b39fa7e911..43bccb4ef4 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -418,24 +418,22 @@ class IndexImpl { vector getMultiplicities(Permutation::Enum permutation) const; // _____________________________________________________________________________ - IdTable scan( - const TripleComponent& col0String, - std::optional> col1String, - const Permutation::Enum& permutation, - Permutation::ColumnIndicesRef additionalColumns, - const ad_utility::SharedCancellationHandle& cancellationHandle, - const LimitOffsetClause& limitOffset = {}) const; + IdTable scan(const ScanSpecificationAsTripleComponent& scanSpecification, + const Permutation::Enum& permutation, + Permutation::ColumnIndicesRef additionalColumns, + const ad_utility::SharedCancellationHandle& cancellationHandle, + const LimitOffsetClause& limitOffset = {}) const; // _____________________________________________________________________________ - IdTable scan(Id col0Id, std::optional col1Id, Permutation::Enum p, + IdTable scan(const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset = {}) const; // _____________________________________________________________________________ - size_t getResultSizeOfScan(const TripleComponent& col0, - const TripleComponent& col1, - const Permutation::Enum& permutation) const; + size_t getResultSizeOfScan( + const ScanSpecificationAsTripleComponent& scanSpecification, + const Permutation::Enum& permutation) const; private: // Private member functions diff --git a/src/index/Permutation.h b/src/index/Permutation.h index 2defb4c58e..db4b36635d 100644 --- a/src/index/Permutation.h +++ b/src/index/Permutation.h @@ -38,8 +38,6 @@ class Permutation { using ColumnIndices = 
CompressedRelationReader::ColumnIndices; using CancellationHandle = ad_utility::SharedCancellationHandle; - using ScanSpecification = CompressedRelationReader::ScanSpecification; - // Convert a permutation to the corresponding string, etc. `PSO` is converted // to "PSO". static std::string_view toString(Enum permutation); diff --git a/src/index/ScanSpecification.cpp b/src/index/ScanSpecification.cpp new file mode 100644 index 0000000000..505dc1e051 --- /dev/null +++ b/src/index/ScanSpecification.cpp @@ -0,0 +1,89 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include "index/ScanSpecification.h" + +#include "index/Index.h" +#include "index/IndexImpl.h" + +// ____________________________________________________________________________ +std::optional +ScanSpecificationAsTripleComponent::toScanSpecification( + const Index& index) const { + return toScanSpecification(index.getImpl()); +} + +// ____________________________________________________________________________ +std::optional +ScanSpecificationAsTripleComponent::toScanSpecification( + const IndexImpl& index) const { + // TODO Use `std::optional::transform`. + // TODO: We can also have LocalVocab entries in the + // ScanSpecification. 
+ bool nonexistingVocabEntryFound = false; + auto getId = + [&index, &nonexistingVocabEntryFound]( + const std::optional& tc) -> std::optional { + if (!tc.has_value()) { + return std::nullopt; + } + auto id = tc.value().toValueId(index.getVocab()); + if (!id.has_value()) { + nonexistingVocabEntryFound = true; + } + return id; + }; + std::optional col0Id = getId(col0_); + std::optional col1Id = getId(col1_); + std::optional col2Id = getId(col2_); + + if (nonexistingVocabEntryFound) { + return std::nullopt; + } + return ScanSpecification{col0Id, col1Id, col2Id}; +} + +// ____________________________________________________________________________ +ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, + T col1, + T col2) { + auto toNulloptIfVariable = [](T& tc) -> std::optional { + if (tc.has_value() && tc.value().isVariable()) { + return std::nullopt; + } else { + return std::move(tc); + } + }; + col0_ = toNulloptIfVariable(col0); + col1_ = toNulloptIfVariable(col1); + col2_ = toNulloptIfVariable(col2); + + if (!col0_.has_value()) { + AD_CONTRACT_CHECK(!col1_.has_value()); + } + if (!col1_.has_value()) { + AD_CONTRACT_CHECK(!col2_.has_value()); + } +} + +// ____________________________________________________________________________ +size_t ScanSpecificationAsTripleComponent::numColumns() const { + auto i = [](const auto& x) -> size_t { + return static_cast(x.has_value()); + }; + return 3 - i(col0_) - i(col1_) - i(col2_); +} + +// _____________________________________________________________________________ +void ScanSpecification::validate() const { + bool c0 = col0Id_.has_value(); + bool c1 = col1Id_.has_value(); + bool c2 = col2Id_.has_value(); + if (!c0) { + AD_CORRECTNESS_CHECK(!c1 && !c2); + } + if (!c1) { + AD_CORRECTNESS_CHECK(!c2); + } +} diff --git a/src/index/ScanSpecification.h b/src/index/ScanSpecification.h new file mode 100644 index 0000000000..de8e915cee --- /dev/null +++ b/src/index/ScanSpecification.h @@ -0,0 +1,81 @@ +// 
Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once +#include + +#include "engine/LocalVocab.h" +#include "global/Id.h" +#include "parser/TripleComponent.h" + +// Forward declaration +class IndexImpl; +class Index; + +// The specification of a scan operation for a given permutation. +// Can either be a full scan (all three elements are `std::nullopt`), +// a scan for a fixed `col0Id`, a scan for a fixed `col0Id` and `col1Id`, +// or even a scan for a single triple to check whether it is contained in +// the knowledge graph at all. The values which are `nullopt` become variables +// and are returned as columns in the result of the scan. +class ScanSpecification { + private: + using T = std::optional; + + T col0Id_; + T col1Id_; + T col2Id_; + friend class ScanSpecificationAsTripleComponent; + + void validate() const; + + public: + ScanSpecification(T col0Id, T col1Id, T col2Id) + : col0Id_{col0Id}, col1Id_{col1Id}, col2Id_{col2Id} { + validate(); + } + const T& col0Id() const { return col0Id_; } + const T& col1Id() const { return col1Id_; } + const T& col2Id() const { return col2Id_; } + + bool operator==(const ScanSpecification&) const = default; + + // Only used in tests. + void setCol1Id(T col1Id) { + col1Id_ = col1Id; + validate(); + } +}; + +// Same as `ScanSpecification` (see above), but stores `TripleComponent`s +// instead of `Id`s. +class ScanSpecificationAsTripleComponent { + using T = std::optional; + + private: + std::optional col0_; + std::optional col1_; + std::optional col2_; + + public: + // Construct from three optional `TripleComponent`s. If any of the three + // entries is unbound (`nullopt` or of type `Variable`), then all subsequent + // entries also have to be unbound. For example if `col0` is bound, but `col1` + // isn't, then `col2` also has to be unbound. + ScanSpecificationAsTripleComponent(T col0, T col1, T col2); + + // Convert to a `ScanSpecification`. 
The `index` is used to convert the + // `TripleComponent` to `Id`s by looking them up in the vocabulary. Return + // `nullopt` if and only if one of the vocab lookups fails (then the result of + // the corresponding scan will be empty). + // TODO Once we implement SPARQL UPDATE, we possibly also have to use the + // `LocalVocab` of the UPDATE triples here. + std::optional toScanSpecification( + const IndexImpl& index) const; + std::optional toScanSpecification( + const Index& index) const; + + // The number of columns that the corresponding index scan will have. + size_t numColumns() const; +}; diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index 29ca836c71..8b1efdcb1e 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -25,5 +25,5 @@ add_library(parser Iri.cpp Literal.cpp LiteralOrIri.cpp) -qlever_target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2::re2 util engine) +qlever_target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2::re2 util engine index) diff --git a/src/util/ConcurrentCache.h b/src/util/ConcurrentCache.h index e76e2980b7..de8d0f270e 100644 --- a/src/util/ConcurrentCache.h +++ b/src/util/ConcurrentCache.h @@ -206,7 +206,7 @@ class ConcurrentCache { } /// Clear the cache, including the pinned entries. 
- virtual void clearAll() { _cacheAndInProgressMap.wlock()->_cache.clearAll(); } + void clearAll() { _cacheAndInProgressMap.wlock()->_cache.clearAll(); } /// Delete elements from the unpinned part of the cache of total size /// at least `size`; diff --git a/test/CompressedRelationsTest.cpp b/test/CompressedRelationsTest.cpp index c2aad9d7c6..24faf16c2e 100644 --- a/test/CompressedRelationsTest.cpp +++ b/test/CompressedRelationsTest.cpp @@ -174,8 +174,7 @@ void testCompressedRelations(const auto& inputs, std::string testCaseName, ASSERT_FLOAT_EQ(m.numRows_ / static_cast(i + 1), m.multiplicityCol1_); // Scan for all distinct `col0` and check that we get the expected result. - CompressedRelationReader::ScanSpecification scanSpec{ - metaData[i].col0Id_, std::nullopt, std::nullopt}; + ScanSpecification scanSpec{metaData[i].col0Id_, std::nullopt, std::nullopt}; IdTable table = reader.scan(scanSpec, blocks, additionalColumns, cancellationHandle); const auto& col1And2 = inputs[i].col1And2_; @@ -207,8 +206,8 @@ void testCompressedRelations(const auto& inputs, std::string testCaseName, std::vector> col3; auto scanAndCheck = [&]() { - CompressedRelationReader::ScanSpecification scanSpec{ - metaData[i].col0Id_, V(lastCol1Id), std::nullopt}; + ScanSpecification scanSpec{metaData[i].col0Id_, V(lastCol1Id), + std::nullopt}; auto size = reader.getResultSizeOfScan(scanSpec, blocks); IdTable tableWidthOne = reader.scan(scanSpec, blocks, Permutation::ColumnIndicesRef{}, diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index df7421dbff..685edfc88a 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -37,7 +37,7 @@ auto makeTestScanWidthOne = [](const IndexImpl& index) { ad_utility::source_location::current()) { auto t = generateLocationTrace(l); IdTable result = - index.scan(c0, std::cref(c1), permutation, additionalColumns, + index.scan({c0, c1, std::nullopt}, permutation, additionalColumns, std::make_shared>()); ASSERT_EQ(result.numColumns(), 1 + 
additionalColumns.size()); ASSERT_EQ(result, makeIdTableFromVector(expected)); @@ -53,9 +53,10 @@ auto makeTestScanWidthTwo = [](const IndexImpl& index) { ad_utility::source_location l = ad_utility::source_location::current()) { auto t = generateLocationTrace(l); - IdTable wol = index.scan( - c0, std::nullopt, permutation, Permutation::ColumnIndicesRef{}, - std::make_shared>()); + IdTable wol = + index.scan({c0, std::nullopt, std::nullopt}, permutation, + Permutation::ColumnIndicesRef{}, + std::make_shared>()); ASSERT_EQ(wol, makeIdTableFromVector(expected)); }; }; diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 42f98286f2..de3f4f3d40 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -17,7 +17,7 @@ constexpr auto iri = ad_utility::testing::iri; using ::testing::HasSubstr; QueryPlanner makeQueryPlanner() { - return QueryPlanner{nullptr, + return QueryPlanner{ad_utility::testing::getQec(), std::make_shared>()}; } @@ -204,6 +204,20 @@ TEST(QueryPlanner, testBFSLeaveOut) { } } +TEST(QueryPlanner, indexScanZeroVariables) { + auto scan = h::IndexScanFromStrings; + using enum Permutation::Enum; + h::expect( + "SELECT * \n " + "WHERE \t { }", + scan("", "", "")); + h::expect( + "SELECT * \n " + "WHERE \t { . ?z}", + h::CartesianProductJoin(scan("", "", ""), + scan("", "", "?z"))); +} + TEST(QueryPlanner, indexScanOneVariable) { auto scan = h::IndexScanFromStrings; using enum Permutation::Enum; @@ -287,31 +301,28 @@ TEST(QueryPlanner, testStarTwoFree) { } TEST(QueryPlanner, testFilterAfterSeed) { - ParsedQuery pq = SparqlParser::parseQuery( + auto scan = h::IndexScanFromStrings; + auto qec = ad_utility::testing::getQec( + " , , . , , ."); + h::expect( "SELECT ?x ?y ?z WHERE {" "?x ?y . ?y ?z . 
" - "FILTER(?x != ?y) }"); - QueryPlanner qp = makeQueryPlanner(); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(qet.getCacheKey(), - "FILTER JOIN\nSCAN POS with P = \"\" join-column: " - "[0]\n|X|\nSCAN PSO with P = \"\" join-column: [0] with " - "N16sparqlExpression10relational20RelationalExpressionILN18valueIdC" - "omparators10ComparisonE3EEE#column_1##column_0#"); + "FILTER(?x != ?y) }", + h::Filter("?x != ?y", + h::Join(scan("?x", "", "?y"), scan("?y", "", "?z"))), + qec); } TEST(QueryPlanner, testFilterAfterJoin) { - ParsedQuery pq = SparqlParser::parseQuery( + auto scan = h::IndexScanFromStrings; + auto qec = ad_utility::testing::getQec(" "); + h::expect( "SELECT ?x ?y ?z WHERE {" "?x ?y . ?y ?z . " - "FILTER(?x != ?z) }"); - QueryPlanner qp = makeQueryPlanner(); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(qet.getCacheKey(), - "FILTER JOIN\nSCAN POS with P = \"\" join-column: " - "[0]\n|X|\nSCAN PSO with P = \"\" join-column: [0] with " - "N16sparqlExpression10relational20RelationalExpressionILN18valueIdC" - "omparators10ComparisonE3EEE#column_1##column_2#"); + "FILTER(?x != ?z) }", + h::Filter("?x != ?z", + h::Join(scan("?x", "", "?y"), scan("?y", "", "?z"))), + qec); } TEST(QueryPlanner, threeVarTriples) { @@ -565,25 +576,18 @@ TEST(QueryExecutionTreeTest, testFormerSegfaultTriFilter) { } TEST(QueryPlanner, testSimpleOptional) { - QueryPlanner qp = makeQueryPlanner(); - - ParsedQuery pq = SparqlParser::parseQuery( + auto scan = h::IndexScanFromStrings; + h::expect( "SELECT ?a ?b \n " - "WHERE {?a ?b . OPTIONAL { ?a ?c }}"); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(qet.getCacheKey(), - "OPTIONAL_JOIN\nSCAN PSO with P = \"\" join-columns: " - "[0]\n|X|\nSCAN PSO with P = \"\" join-columns: [0]"); - - ParsedQuery pq2 = SparqlParser::parseQuery( + "WHERE {?a ?b . 
OPTIONAL { ?a ?c }}", + h::OptionalJoin(scan("?a", "", "?b"), scan("?a", "", "?c"))); + h::expect( "SELECT ?a ?b \n " "WHERE {?a ?b . " - "OPTIONAL { ?a ?c }} ORDER BY ?b"); - QueryExecutionTree qet2 = qp.createExecutionTree(pq2); - ASSERT_EQ(qet2.getCacheKey(), - "ORDER BY on columns:asc(1) \nOPTIONAL_JOIN\nSCAN PSO with P = " - "\"\" join-columns: [0]\n|X|\nSCAN PSO with P = \"\" " - "join-columns: [0]"); + "OPTIONAL { ?a ?c }} ORDER BY ?b", + h::OrderBy({{Variable{"?b"}, ::OrderBy::AscOrDesc::Asc}}, + h::OptionalJoin(scan("?a", "", "?b"), + scan("?a", "", "?c")))); } TEST(QueryPlanner, SimpleTripleOneVariable) { diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index c33f9a2faa..37c48f3124 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -16,6 +16,7 @@ #include "engine/Join.h" #include "engine/MultiColumnJoin.h" #include "engine/NeutralElementOperation.h" +#include "engine/OptionalJoin.h" #include "engine/OrderBy.h" #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" @@ -204,6 +205,8 @@ inline auto IndexScanFromStrings = inline auto MultiColumnJoin = MatchTypeAndUnorderedChildren<::MultiColumnJoin>; inline auto Join = MatchTypeAndUnorderedChildren<::Join>; +constexpr auto OptionalJoin = MatchTypeAndOrderedChildren<::OptionalJoin>; + // Return a matcher that matches a query execution tree that consists of // multiple JOIN operations that join the `children`. The `INTERNAL SORT BY` // operations required for the joins are also ignored by this matcher. 
diff --git a/test/TriplesViewTest.cpp b/test/TriplesViewTest.cpp index 4c9fd53825..956100991c 100644 --- a/test/TriplesViewTest.cpp +++ b/test/TriplesViewTest.cpp @@ -30,7 +30,7 @@ struct DummyPermutation { } cppcoro::generator lazyScan( - CompressedRelationReader::ScanSpecification scanSpec, + ScanSpecification scanSpec, std::optional> blocks, std::span, const auto&) const { AD_CORRECTNESS_CHECK(!blocks.has_value()); diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp index 95ce76ce46..8c4b86d019 100644 --- a/test/ValuesForTestingTest.cpp +++ b/test/ValuesForTestingTest.cpp @@ -25,9 +25,9 @@ TEST(ValuesForTesting, valuesForTesting) { ASSERT_EQ(v.getMultiplicity(0), 42.0); ASSERT_EQ(v.getMultiplicity(1), 84.0); - ASSERT_THAT( - v.getCacheKey(), - ::testing::StartsWith("Values for testing with 2 columns. V:3 V:12")); + ASSERT_THAT(v.getCacheKey(), + ::testing::StartsWith( + "Values for testing with 2 columns and 3 rows. V:3 V:12")); ASSERT_THAT(v.getCacheKey(), ::testing::EndsWith("Supports limit: 0")); ASSERT_EQ(v.getDescriptor(), "explicit values for testing"); ASSERT_TRUE(v.resultSortedOn().empty()); @@ -36,3 +36,13 @@ TEST(ValuesForTesting, valuesForTesting) { auto result = v.getResult(); ASSERT_EQ(result->idTable(), table); } + +// ____________________________________________________________________________ +TEST(ValuesForTesting, cornerCasesCacheKey) { + auto empty = makeIdTableFromVector({}); + auto neutral = makeIdTableFromVector({{}}); + + ValuesForTesting vEmpty{getQec(), empty.clone(), {}}; + ValuesForTesting vNeutral{getQec(), neutral.clone(), {}}; + EXPECT_NE(vEmpty.getCacheKey(), vNeutral.getCacheKey()); +} diff --git a/test/engine/CartesianProductJoinTest.cpp b/test/engine/CartesianProductJoinTest.cpp index f07bb11218..cd05e31b9b 100644 --- a/test/engine/CartesianProductJoinTest.cpp +++ b/test/engine/CartesianProductJoinTest.cpp @@ -90,9 +90,21 @@ void testCartesianProduct(VectorTable expected, std::vector inputs, 
TEST(CartesianProductJoin, computeResult) { // Simple base cases. VectorTable v{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; + VectorTable empty{}; testCartesianProduct(v, {v}); - testCartesianProduct({}, {{}, v, {}}); - testCartesianProduct({}, {{}, {}}); + testCartesianProduct(empty, {empty, v, empty}); + testCartesianProduct(empty, {empty, empty}); + + // Test cases where some or all of the inputs are Neutral elements (1 row, + // zero columns) that are automatically filtered out by the + // `CartesianProductJoin`. + VectorTable neutral{{}}; + testCartesianProduct(neutral, {neutral}); + testCartesianProduct(v, {v, neutral}); + testCartesianProduct(v, {neutral, v, neutral}); + testCartesianProduct(neutral, {neutral, neutral, neutral}); + testCartesianProduct(empty, {neutral, empty, neutral}); + testCartesianProduct(empty, {neutral, empty, v}); // Fails because of an empty input. EXPECT_ANY_THROW(makeJoin({})); diff --git a/test/engine/IndexScanTest.cpp b/test/engine/IndexScanTest.cpp index 2920a2cd8e..abc5babd38 100644 --- a/test/engine/IndexScanTest.cpp +++ b/test/engine/IndexScanTest.cpp @@ -380,3 +380,67 @@ TEST(IndexScan, additionalColumn) { {{getId(""), getId(""), I(0), I(NO_PATTERN)}}); EXPECT_THAT(res.idTable(), ::testing::ElementsAreArray(exp)); } + +TEST(IndexScan, getResultSizeOfScan) { + auto qec = getQec("

, . ."); + auto getId = makeGetId(qec->getIndex()); + [[maybe_unused]] auto x = getId(""); + [[maybe_unused]] auto p = getId("

"); + [[maybe_unused]] auto s1 = getId(""); + [[maybe_unused]] auto s2 = getId(""); + [[maybe_unused]] auto p2 = getId(""); + using V = Variable; + using I = TripleComponent::Iri; + + { + SparqlTripleSimple scanTriple{V{"?x"}, V("?y"), V{"?z"}}; + IndexScan scan{qec, Permutation::Enum::PSO, scanTriple}; + // Note: this currently also contains the (internal) triple for the + // `ql:has-pattern` relation of ``. + EXPECT_EQ(scan.getSizeEstimate(), 4); + } + { + SparqlTripleSimple scanTriple{V{"?x"}, I::fromIriref("

"), V{"?y"}}; + IndexScan scan{qec, Permutation::Enum::PSO, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 2); + } + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + V{"?y"}}; + IndexScan scan{qec, Permutation::Enum::PSO, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 2); + } + { + SparqlTripleSimple scanTriple{V("?x"), I::fromIriref("

"), + I::fromIriref("")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 1); + } + // 0 variables + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + I::fromIriref("")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 1); + EXPECT_ANY_THROW(scan.getMultiplicity(0)); + auto res = scan.computeResultOnlyForTesting(); + ASSERT_EQ(res.idTable().numRows(), 1); + ASSERT_EQ(res.idTable().numColumns(), 0); + } + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + I::fromIriref("")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 0); + } + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + I::fromIriref("

")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 0); + EXPECT_ANY_THROW(scan.getMultiplicity(0)); + auto res = scan.computeResultOnlyForTesting(); + ASSERT_EQ(res.idTable().numRows(), 0); + ASSERT_EQ(res.idTable().numColumns(), 0); + } +} diff --git a/test/engine/ValuesForTesting.h b/test/engine/ValuesForTesting.h index dcbf130da8..ac7e363a95 100644 --- a/test/engine/ValuesForTesting.h +++ b/test/engine/ValuesForTesting.h @@ -65,7 +65,8 @@ class ValuesForTesting : public Operation { // ___________________________________________________________________________ string getCacheKeyImpl() const override { std::stringstream str; - str << "Values for testing with " << table_.numColumns() << " columns. "; + str << "Values for testing with " << table_.numColumns() << " columns and " + << table_.numRows() << " rows. "; if (table_.numRows() > 1000) { str << ad_utility::FastRandomIntGenerator{}(); } else { diff --git a/test/index/CMakeLists.txt b/test/index/CMakeLists.txt index aa0d0950e1..3651259d1a 100644 --- a/test/index/CMakeLists.txt +++ b/test/index/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(vocabulary) addLinkAndDiscoverTest(PatternCreatorTest index) +addLinkAndDiscoverTestSerial(ScanSpecificationTest index) diff --git a/test/index/ScanSpecificationTest.cpp b/test/index/ScanSpecificationTest.cpp new file mode 100644 index 0000000000..3c7df13755 --- /dev/null +++ b/test/index/ScanSpecificationTest.cpp @@ -0,0 +1,81 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include + +#include "../util/GTestHelpers.h" +#include "../util/IndexTestHelpers.h" +#include "index/ScanSpecification.h" + +// _____________________________________________________________________________ +TEST(ScanSpecification, validate) { + Id i = Id::makeFromInt(42); + auto n = std::nullopt; + using S = ScanSpecification; + EXPECT_NO_THROW(S(i, i, i)); + EXPECT_NO_THROW(S(i, i, n)); + EXPECT_NO_THROW(S(i, n, n)); + EXPECT_NO_THROW(S(n, n, n)); + + EXPECT_ANY_THROW(S(n, i, i)); + EXPECT_ANY_THROW(S(n, n, i)); + EXPECT_ANY_THROW(S(n, i, n)); + EXPECT_ANY_THROW(S(i, n, i)); +} + +// _____________________________________________________________________________ +TEST(ScanSpecification, ScanSpecificationAsTripleComponent) { + Id i = Id::makeFromInt(42); + TripleComponent iTc{42}; + auto n = std::nullopt; + using S = ScanSpecification; + using STc = ScanSpecificationAsTripleComponent; + + EXPECT_ANY_THROW(STc(n, iTc, iTc)); + EXPECT_ANY_THROW(STc(n, n, iTc)); + EXPECT_ANY_THROW(STc(n, iTc, n)); + EXPECT_ANY_THROW(STc(iTc, n, iTc)); + + const auto& index = ad_utility::testing::getQec()->getIndex(); + auto toScanSpec = [&index](const STc& s) { + return s.toScanSpecification(index); + }; + + // Match that a `ScanSpecificationAsTripleComponent` has the expected number + // of columns, and yields the expected `ScanSpecification` when + // `toScanSpecification` is called on it. 
+ auto matchScanSpec = + [&toScanSpec](const std::optional spec, + size_t numColumns = 0) -> ::testing::Matcher { + auto innerMatcher = [&toScanSpec, &spec] { + return ::testing::ResultOf(toScanSpec, ::testing::Eq(spec)); + }; + if (!spec.has_value()) { + return innerMatcher(); + } else { + return ::testing::AllOf( + innerMatcher(), + AD_PROPERTY(STc, numColumns, ::testing::Eq(numColumns))); + } + }; + EXPECT_THAT(STc(iTc, iTc, iTc), matchScanSpec(S(i, i, i), 0)); + EXPECT_THAT(STc(iTc, iTc, n), matchScanSpec(S(i, i, n), 1)); + EXPECT_THAT(STc(iTc, n, n), matchScanSpec(S(i, n, n), 2)); + EXPECT_THAT(STc(n, n, n), matchScanSpec(S(n, n, n), 3)); + + // Test the resolution of vocab entries. + auto getId = ad_utility::testing::makeGetId(index); + auto x = getId(""); + TripleComponent xIri = TripleComponent::Iri::fromIriref(""); + + EXPECT_THAT(STc(xIri, xIri, xIri), matchScanSpec(S(x, x, x), 0)); + + // For an entry that is not in the vocabulary, the complete result of + // `toScanSpecification` is `nullopt`. 
+ TripleComponent notInVocab = + TripleComponent::Iri::fromIriref(""); + EXPECT_THAT(STc(notInVocab, xIri, xIri), matchScanSpec(std::nullopt)); + EXPECT_THAT(STc(xIri, notInVocab, xIri), matchScanSpec(std::nullopt)); + EXPECT_THAT(STc(xIri, xIri, notInVocab), matchScanSpec(std::nullopt)); +} diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index db4c89c228..9d208f70c5 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -60,8 +60,9 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); auto checkSingleElement = [&cancellationDummy, &hasPatternId]( const Index& index, size_t patternIdx, Id id) { - auto scanResultHasPattern = index.scan( - hasPatternId, id, Permutation::Enum::PSO, {}, cancellationDummy); + auto scanResultHasPattern = + index.scan({hasPatternId, id, std::nullopt}, Permutation::Enum::PSO, {}, + cancellationDummy); // Each ID has at most one pattern, it can have none if it doesn't // appear as a subject in the knowledge graph. AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); @@ -79,7 +80,7 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto cancellationDummy = std::make_shared>(); auto scanResult = index.scan( - col0Id, std::nullopt, permutation, + {col0Id, std::nullopt, std::nullopt}, permutation, std::array{ColumnIndex{ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN}, ColumnIndex{ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}}, cancellationDummy);