Skip to content

Commit

Permalink
Fix a small bug in the pattern creation (#1396)
Browse files Browse the repository at this point in the history
There was a bug in the pattern creation when the first or last triple of a group of triples with the same subject was an "ignored" triple (where the predicate should not contribute towards the pattern). This is now fixed and tested and the code also became simpler.
  • Loading branch information
joka921 authored Jul 18, 2024
1 parent 49e4134 commit 2b8dd19
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 41 deletions.
87 changes: 50 additions & 37 deletions src/index/PatternCreator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,64 +10,77 @@ static const Id hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE);

// _________________________________________________________________________
void PatternCreator::processTriple(std::array<Id, 3> triple,
bool ignoreForPatterns) {
if (ignoreForPatterns) {
tripleBuffer_.emplace_back(triple, ignoreForPatterns);
return;
}
bool ignoreTripleForPatterns) {
if (!currentSubject_.has_value()) {
// This is the first triple.
// This is the very first triple.
currentSubject_ = triple[0];
} else if (triple[0] != currentSubject_) {
// New subject.
finishSubject(currentSubject_.value(), currentPattern_);
currentSubject_ = triple[0];
currentPattern_.clear();
}
tripleBuffer_.emplace_back(triple, ignoreForPatterns);
// Don't list predicates twice in the same pattern.
tripleBuffer_.emplace_back(triple, ignoreTripleForPatterns);
if (ignoreTripleForPatterns) {
return;
}
// Add predicate to pattern unless it was already added.
if (currentPattern_.empty() || currentPattern_.back() != triple[1]) {
currentPattern_.push_back(triple[1]);
}
}

// ________________________________________________________________________________
void PatternCreator::finishSubject(Id subject, const Pattern& pattern) {
numDistinctSubjects_++;
// _____________________________________________________________________________
PatternID PatternCreator::finishPattern(const Pattern& pattern) {
if (pattern.empty()) {
return NO_PATTERN;
}
numDistinctSubjectPredicatePairs_ += pattern.size();
PatternID patternId;
auto it = patternToIdAndCount_.find(pattern);
if (it == patternToIdAndCount_.end()) {
// This is a new pattern, assign a new pattern ID and a count of 1.
patternId = static_cast<PatternID>(patternToIdAndCount_.size());
patternToIdAndCount_[pattern] = PatternIdAndCount{patternId, 1UL};

// Count the total number of distinct predicates that appear in the
// pattern and have not been counted before.
for (auto predicate : pattern) {
distinctPredicates_.insert(predicate);
}
} else {
if (it != patternToIdAndCount_.end()) {
// We have already seen the same pattern for a previous subject ID, reuse
// the ID and increase the count.
patternId = it->second.patternId_;
it->second.count_++;
return it->second.patternId_;
}
// This is a new pattern, assign a new pattern ID and a count of 1.
auto patternId = static_cast<PatternID>(patternToIdAndCount_.size());
patternToIdAndCount_[pattern] = PatternIdAndCount{patternId, 1UL};

// Count the total number of distinct predicates that appear in the
// pattern and have not been counted before.
for (auto predicate : pattern) {
distinctPredicates_.insert(predicate);
}
return patternId;
}

auto additionalTriple =
std::array{subject, hasPatternId, Id::makeFromInt(patternId)};
tripleSorter_.hasPatternPredicateSortedByPSO_->push(additionalTriple);
// ________________________________________________________________________________
void PatternCreator::finishSubject(Id subject, const Pattern& pattern) {
// Write the pattern to disk and obtain its ID.
PatternID patternId = finishPattern(pattern);

// Write the triple `<subject> ql:has-pattern <patternId>`, but only if the
// subject has a pattern.
if (!pattern.empty()) {
auto additionalTriple =
std::array{subject, hasPatternId, Id::makeFromInt(patternId)};
tripleSorter_.hasPatternPredicateSortedByPSO_->push(additionalTriple);
++numDistinctSubjects_;
}

// Write the quads `<subject> <predicate> <object> <patternOfSubject>.
// Note: This has to be done for all triples, including those where the
// subject has no pattern.
auto curSubject = currentSubject_.value();
std::ranges::for_each(tripleBuffer_, [this, patternId,
&curSubject](const auto& t) {
const auto& [s, p, o] = t.triple_;
// It might happen that the `tripleBuffer_` contains different subjects
// which are purely internal and therefore have no pattern.
auto actualPatternId =
Id::makeFromInt(curSubject != s ? NO_PATTERN : patternId);
AD_CORRECTNESS_CHECK(curSubject == s || t.isInternal_);
ospSorterTriplesWithPattern().push(std::array{s, p, o, actualPatternId});
});
std::ranges::for_each(tripleBuffer_,
[this, patternId, &curSubject](const auto& t) {
const auto& [s, p, o] = t.triple_;
AD_CORRECTNESS_CHECK(s == curSubject);
ospSorterTriplesWithPattern().push(
std::array{s, p, o, Id::makeFromInt(patternId)});
});

tripleBuffer_.clear();
}

Expand Down
3 changes: 2 additions & 1 deletion src/index/PatternCreator.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ class PatternCreator {
// This function has to be called for all the triples in the SPO permutation
// The `triple` must be >= all previously pushed triples wrt the SPO
// permutation.
void processTriple(std::array<Id, 3> triple, bool ignoreForPatterns);
void processTriple(std::array<Id, 3> triple, bool ignoreTripleForPatterns);

// Write the patterns to disk after all triples have been pushed. Calls to
// `processTriple` after calling `finish` lead to undefined behavior. Note
Expand Down Expand Up @@ -175,6 +175,7 @@ class PatternCreator {

private:
void finishSubject(Id subject, const Pattern& pattern);
PatternID finishPattern(const Pattern& pattern);

void printStatistics(PatternStatistics patternStatistics) const;

Expand Down
11 changes: 8 additions & 3 deletions test/index/PatternCreatorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,16 @@ auto createExamplePatterns(PatternCreator& creator) {
expected.push_back(A{triple[0], triple[1], triple[2], I(patternIdx)});
};

// The first subject gets the first pattern.
// The first subject gets the first pattern. We have an ignored triple at the
// end which doesn't count towards the pattern.
push({V(0), V(10), V(20)}, false, 0);
push({V(0), V(10), V(21)}, false, 0);
push({V(0), V(11), V(18)}, false, 0);
push({V(0), V(12), V(18)}, true, 0);

// New subject, different predicates, so a new pattern
// New subject, different predicates, so a new pattern.
push({V(1), V(10), V(18)}, false, 1);
// ignored triple, but `V(1)` has other non-ignored triple, so it will have a
// Ignored triple, but `V(1)` has other non-ignored triple, so it will have a
// pattern, but `V(11)` will not contribute to that pattern.
push({V(1), V(11), V(18)}, true, 1);
push({V(1), V(12), V(18)}, false, 1);
Expand All @@ -88,6 +90,9 @@ auto createExamplePatterns(PatternCreator& creator) {
push({V(2), V(14), V(18)}, true, NO_PATTERN);

// New subject, but has the same predicate and therefore patterns as `V(0)`.
// We have an ignored triple at the beginning, which doesn't count towards
// the pattern.
push({V(3), V(9), V(18)}, true, 0);
push({V(3), V(10), V(28)}, false, 0);
push({V(3), V(11), V(29)}, false, 0);
push({V(3), V(11), V(45)}, false, 0);
Expand Down

0 comments on commit 2b8dd19

Please sign in to comment.