The unit tests pass again.

Check for code coverage/self-review/etc.

Signed-off-by: Johannes Kalmbach <[email protected]>
ad-freiburg · Oct 31, 2024 · f0caa80
1 parent 1435b1c
commit f0caa80
Show file tree

Hide file tree

Showing 9 changed files with 207 additions and 66 deletions.
diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp
@@ -34,7 +34,7 @@ IndexScan::IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation,
     additionalColumns_.push_back(idx);
     additionalVariables_.push_back(variable);
   }
-  sizeEstimate_ = computeSizeEstimate();
+  std::tie(sizeEstimateIsExact_, sizeEstimate_) = computeSizeEstimate();
 
   // Check the following invariant: All the variables must be at the end of the
   // permuted triple. For example in the PSO permutation, either only the O, or
@@ -171,7 +171,18 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) {
 }
 
 // _____________________________________________________________________________
-size_t IndexScan::computeSizeEstimate() const {
+std::pair<bool, size_t> IndexScan::computeSizeEstimate() const {
+  AD_CORRECTNESS_CHECK(_executionContext);
+  auto [lower, upper] =
+      getIndex()
+          .getImpl()
+          .getPermutation(permutation())
+          .getSizeEstimateForScan(getScanSpecification(), deltaTriples());
+  return {lower == upper, std::midpoint(lower, upper)};
+}
+
+// _____________________________________________________________________________
+size_t IndexScan::getExactSize() const {
   AD_CORRECTNESS_CHECK(_executionContext);
   return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_,
                                         deltaTriples());

diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h
@@ -20,6 +20,7 @@ class IndexScan final : public Operation {
   Graphs graphsToFilter_;
   size_t numVariables_;
   size_t sizeEstimate_;
+  bool sizeEstimateIsExact_;
   vector<float> multiplicity_;
 
   // Additional columns (e.g. patterns) that are being retrieved in addition to
@@ -59,7 +60,7 @@ class IndexScan final : public Operation {
 
   // Return the exact result size of the index scan. This is always known as it
   // can be read from the Metadata.
-  size_t getExactSize() const { return sizeEstimate_; }
+  size_t getExactSize() const;
 
   // Return two generators that lazily yield the results of `s1` and `s2` in
   // blocks, but only the blocks that can theoretically contain matching rows
@@ -78,7 +79,7 @@ class IndexScan final : public Operation {
  private:
   // TODO<joka921> Make the `getSizeEstimateBeforeLimit()` function `const` for
   // ALL the `Operations`.
-  uint64_t getSizeEstimateBeforeLimit() override { return getExactSize(); }
+  uint64_t getSizeEstimateBeforeLimit() override { return sizeEstimate_; }
 
  public:
   size_t getCostEstimate() override;
@@ -93,7 +94,9 @@ class IndexScan final : public Operation {
     return multiplicity_[col];
   }
 
-  bool knownEmptyResult() override { return getExactSize() == 0; }
+  bool knownEmptyResult() override {
+    return sizeEstimateIsExact_ && sizeEstimate_ == 0;
+  }
 
   bool isIndexScanWithNumVariables(size_t target) const override {
     return numVariables() == target;
@@ -103,7 +106,7 @@ class IndexScan final : public Operation {
   // size of wikidata, so we don't even need to try and waste performance.
   bool unlikelyToFitInCache(
       ad_utility::MemorySize maxCacheableSize) const override {
-    return ad_utility::MemorySize::bytes(getExactSize() * getResultWidth() *
+    return ad_utility::MemorySize::bytes(sizeEstimate_ * getResultWidth() *
                                          sizeof(Id)) > maxCacheableSize;
   }
 
@@ -124,7 +127,7 @@ class IndexScan final : public Operation {
 
   vector<QueryExecutionTree*> getChildren() override { return {}; }
 
-  size_t computeSizeEstimate() const;
+  std::pair<bool, size_t> computeSizeEstimate() const;
 
   std::string getCacheKeyImpl() const override;
 

diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp
@@ -635,8 +635,6 @@ DecompressedBlock CompressedRelationReader::readPossiblyIncompleteBlock(
   return result;
 };
 
-/*
-// ____________________________________________________________________________
 template <bool exactSize>
 std::pair<size_t, size_t> CompressedRelationReader::getResultSizeImpl(
     const ScanSpecification& scanSpec,
@@ -647,17 +645,16 @@ std::pair<size_t, size_t> CompressedRelationReader::getResultSizeImpl(
   // col1Id
   auto relevantBlocks = getRelevantBlocks(scanSpec, blocks);
   auto [beginBlock, endBlock] = getBeginAndEnd(relevantBlocks);
-  // TODO<joka921> The exact size is also wrong as soon as we have GRAPHS
-  // involved.
-  std::array<ColumnIndex, 1> columnIndices{0u};
+
+  auto config = getScanConfig(scanSpec, {}, locatedTriplesPerBlock);
 
   // The first and the last block might be incomplete (that is, only
   // a part of these blocks is actually part of the result,
   // set up a lambda which allows us to read these blocks, and returns
   // the size of the result.
   auto readSizeOfPossiblyIncompleteBlock = [&](const auto& block) {
-    return readPossiblyIncompleteBlock(scanSpec, block, std::nullopt,
-                                       columnIndices, locatedTriplesPerBlock)
+    return readPossiblyIncompleteBlock(scanSpec, config, block, std::nullopt,
+                                       locatedTriplesPerBlock)
         .numRows();
   };
 
@@ -674,47 +671,52 @@ std::pair<size_t, size_t> CompressedRelationReader::getResultSizeImpl(
   }
 
   if (beginBlock == endBlock) {
-    return { numResults, numResults };
+    return {numResults, numResults};
   }
 
-  // TODO<joka921> There are a lot of bugs here when it comes to graph columns.
-  // In particular the prepareColumnIndices should directly add the graph column
-  // if necessary etc.
-  auto allColumns = prepareColumnIndices(scanSpec, {});
-  allColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID);
-  auto locatedTriples = prepareLocatedTriples(allColumns,
-locatedTriplesPerBlock);
-
   // Determine the total size of the result.
   // First accumulate the complete blocks in the "middle"
   // TODO<joka921> If the block contains  added or deleted triples, we
   // actually have to read/materialize them to get the correct size.
-  std::size_t fromIndex;
-  std::size_t inserted;
-  std::size_t deleted;
+  std::size_t inserted = 0;
+  std::size_t deleted = 0;
   std::ranges::for_each(
       std::ranges::subrange{beginBlock, endBlock}, [&](const auto& block) {
         const auto [ins, del] =
             locatedTriplesPerBlock.numTriples(block.blockIndex_);
         if (!exactSize || (ins == 0 && del == 0)) {
           inserted += ins;
           deleted += del;
-          fromIndex += block.numRows_;
+          numResults += block.numRows_;
         } else {
-          fromIndex += readAndDecompressBlock(block, allColumns,
-locatedTriples).numRows();
+          // TODO<joka921> We could cache the exact size as soon as we have
+          // merged the block once since the last update.
+          auto b = readAndDecompressBlock(block, config);
+          numResults += b.has_value() ? b.value().numRows() : 0u;
         }
       });
-  return {fromIndex - std::max(deleted, fromIndex), fromIndex + inserted};
+  return {numResults - std::min(deleted, numResults), numResults + inserted};
+}
+
+// ____________________________________________________________________________
+std::pair<size_t, size_t> CompressedRelationReader::getSizeEstimateForScan(
+    const ScanSpecification& scanSpec,
+    const vector<CompressedBlockMetadata>& blocks,
+    const LocatedTriplesPerBlock& locatedTriplesPerBlock) const {
+  return getResultSizeImpl<false>(scanSpec, blocks, locatedTriplesPerBlock);
 }
- */
 
 // ____________________________________________________________________________
 size_t CompressedRelationReader::getResultSizeOfScan(
     const ScanSpecification& scanSpec,
     const vector<CompressedBlockMetadata>& blocks,
     [[maybe_unused]] const LocatedTriplesPerBlock& locatedTriplesPerBlock)
     const {
+  auto [lower, upper] =
+      getResultSizeImpl<true>(scanSpec, blocks, locatedTriplesPerBlock);
+  AD_CORRECTNESS_CHECK(lower == upper);
+  return lower;
+  /*
   // Get all the blocks  that possibly might contain our pair of col0Id and
   // col1Id
   auto relevantBlocks = getRelevantBlocks(scanSpec, blocks);
@@ -752,6 +754,7 @@ size_t CompressedRelationReader::getResultSizeOfScan(
                                   return count + block.numRows_;
                                 });
   return numResults;
+   */
 }
 
 // ____________________________________________________________________________

diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h
@@ -580,7 +580,20 @@ class CompressedRelationReader {
       const ScanSpecification& scanSpec,
       const vector<CompressedBlockMetadata>& blocks,
       const LocatedTriplesPerBlock& locatedTriplesPerBlock) const;
+  std::pair<size_t, size_t> getSizeEstimateForScan(
+      const ScanSpecification& scanSpec,
+      const vector<CompressedBlockMetadata>& blocks,
+      const LocatedTriplesPerBlock& locatedTriplesPerBlock) const;
+
+ private:
+  template <bool exactSize>
+  std::pair<size_t, size_t> getResultSizeImpl(
+      const ScanSpecification& scanSpec,
+      const vector<CompressedBlockMetadata>& blocks,
+      [[maybe_unused]] const LocatedTriplesPerBlock& locatedTriplesPerBlock)
+      const;
 
+ public:
   // For a given relation, determine the `col1Id`s and their counts. This is
   // used for `computeGroupByObjectWithCount`.
   IdTable getDistinctCol1IdsAndCounts(

diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp
@@ -268,6 +268,27 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() {
     }
     blockIndex++;
   }
+  // Also account for the last block that contains the triples that are larger
+  // than all the inserted triples.
+  if (hasUpdates(blockIndex)) {
+    const auto& blockUpdates = map_.at(blockIndex);
+    auto firstTriple = blockUpdates.begin()->triple_.toPermutedTriple();
+    auto lastTriple = blockUpdates.rbegin()->triple_.toPermutedTriple();
+
+    using O = CompressedBlockMetadata::OffsetAndCompressedSize;
+    O emptyBlock{0, 0};
+
+    // TODO<joka921> We need the appropriate number of columns here, or we need
+    // to make the reading code work regardless of the number of columns.
+    CompressedBlockMetadataNoBlockIndex lastBlockN{
+        std::vector<O>(4, emptyBlock),
+        0,
+        firstTriple,
+        lastTriple,
+        std::nullopt,
+        true};
+    augmentedMetadata_->emplace_back(lastBlockN, blockIndex);
+  }
 }
 
 // ____________________________________________________________________________

diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h
@@ -90,6 +90,8 @@ class LocatedTriplesPerBlock {
   // the updated triples.
   std::optional<std::vector<CompressedBlockMetadata>> augmentedMetadata_;
   std::vector<CompressedBlockMetadata> originalMetadata_;
+
+ public:
   void updateAugmentedMetadata();
 
  public:

diff --git a/src/index/Permutation.cpp b/src/index/Permutation.cpp
@@ -77,6 +77,14 @@ size_t Permutation::getResultSizeOfScan(
                                         locatedTriples(deltaTriples));
 }
 
+// _____________________________________________________________________
+std::pair<size_t, size_t> Permutation::getSizeEstimateForScan(
+    const ScanSpecification& scanSpec, const DeltaTriples& deltaTriples) const {
+  const auto& p = getActualPermutation(scanSpec);
+  return p.reader().getSizeEstimateForScan(scanSpec, p.meta_.blockData(),
+                                           locatedTriples(deltaTriples));
+}
+
 // ____________________________________________________________________________
 IdTable Permutation::getDistinctCol1IdsAndCounts(
     Id col0Id, const CancellationHandle& cancellationHandle,

diff --git a/src/index/Permutation.h b/src/index/Permutation.h
@@ -120,6 +120,9 @@ class Permutation {
   /// result
   size_t getResultSizeOfScan(const ScanSpecification& scanSpec,
                              const DeltaTriples& deltaTriples) const;
+  std::pair<size_t, size_t> getSizeEstimateForScan(
+      const ScanSpecification& scanSpec,
+      const DeltaTriples& deltaTriples) const;
 
   // _______________________________________________________
   void setKbName(const string& name) { meta_.setName(name); }