KNNIterators should support with and without filters (#2155)

* Rename classes to represent both filter and non-filter use cases * Iterator should support both with and without filters. Update VectorIterator and NestedVectorIterator to iterate even if no filter is provided to the iterator. Currently this is used by exact search to score either the top-k docs or all docs when a filter is provided by users. However, in the future we will allow exact search even if there are no filters. Hence, decouple the filter and make it optional to support both cases. --------- Signed-off-by: Vijayan Balasubramanian <[email protected]>
opensearch-project · Sep 28, 2024 · 6f6dd56 · 6f6dd56
1 parent e0c3afe
commit 6f6dd56
Show file tree

Hide file tree

Showing 13 changed files with 404 additions and 192 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 * Add short circuit if no live docs are in segments [#2059](https://github.com/opensearch-project/k-NN/pull/2059)
 * Optimize reduceToTopK in ResultUtil by removing pre-filling and reducing peek calls [#2146](https://github.com/opensearch-project/k-NN/pull/2146)
 * Update Default Rescore Context based on Dimension [#2149](https://github.com/opensearch-project/k-NN/pull/2149)
+* KNNIterators should support with and without filters [#2155](https://github.com/opensearch-project/k-NN/pull/2155)
 ### Bug Fixes
 * KNN80DocValues should only be considered for BinaryDocValues fields [#2147](https://github.com/opensearch-project/k-NN/pull/2147)
 ### Infrastructure

diff --git a/src/main/java/org/opensearch/knn/index/query/ExactSearcher.java b/src/main/java/org/opensearch/knn/index/query/ExactSearcher.java
@@ -20,11 +20,11 @@
 import org.opensearch.knn.common.FieldInfoExtractor;
 import org.opensearch.knn.index.SpaceType;
 import org.opensearch.knn.index.VectorDataType;
-import org.opensearch.knn.index.query.filtered.FilteredIdsKNNByteIterator;
-import org.opensearch.knn.index.query.filtered.FilteredIdsKNNIterator;
-import org.opensearch.knn.index.query.filtered.KNNIterator;
-import org.opensearch.knn.index.query.filtered.NestedFilteredIdsKNNByteIterator;
-import org.opensearch.knn.index.query.filtered.NestedFilteredIdsKNNIterator;
+import org.opensearch.knn.index.query.iterators.ByteVectorIdsKNNIterator;
+import org.opensearch.knn.index.query.iterators.VectorIdsKNNIterator;
+import org.opensearch.knn.index.query.iterators.KNNIterator;
+import org.opensearch.knn.index.query.iterators.NestedByteVectorIdsKNNIterator;
+import org.opensearch.knn.index.query.iterators.NestedVectorIdsKNNIterator;
 import org.opensearch.knn.index.vectorvalues.KNNBinaryVectorValues;
 import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues;
 import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
@@ -51,8 +51,9 @@ public class ExactSearcher {
      */
     public Map<Integer, Float> searchLeaf(final LeafReaderContext leafReaderContext, final ExactSearcherContext exactSearcherContext)
         throws IOException {
-        KNNIterator iterator = getMatchedKNNIterator(leafReaderContext, exactSearcherContext);
-        if (exactSearcherContext.getMatchedDocs().cardinality() <= exactSearcherContext.getK()) {
+        KNNIterator iterator = getKNNIterator(leafReaderContext, exactSearcherContext);
+        if (exactSearcherContext.getMatchedDocs() != null
+            && exactSearcherContext.getMatchedDocs().cardinality() <= exactSearcherContext.getK()) {
             return scoreAllDocs(iterator);
         }
         return searchTopK(iterator, exactSearcherContext.getK());
@@ -98,8 +99,7 @@ private Map<Integer, Float> searchTopK(KNNIterator iterator, int k) throws IOExc
         return docToScore;
     }
 
-    private KNNIterator getMatchedKNNIterator(LeafReaderContext leafReaderContext, ExactSearcherContext exactSearcherContext)
-        throws IOException {
+    private KNNIterator getKNNIterator(LeafReaderContext leafReaderContext, ExactSearcherContext exactSearcherContext) throws IOException {
         final KNNQuery knnQuery = exactSearcherContext.getKnnQuery();
         final BitSet matchedDocs = exactSearcherContext.getMatchedDocs();
         final SegmentReader reader = Lucene.segmentReader(leafReaderContext.reader());
@@ -108,20 +108,18 @@ private KNNIterator getMatchedKNNIterator(LeafReaderContext leafReaderContext, E
 
         boolean isNestedRequired = exactSearcherContext.isParentHits() && knnQuery.getParentsFilter() != null;
 
-        if (VectorDataType.BINARY == knnQuery.getVectorDataType() && isNestedRequired) {
-            final KNNVectorValues<byte[]> vectorValues = KNNVectorValuesFactory.getVectorValues(fieldInfo, reader);
-            return new NestedFilteredIdsKNNByteIterator(
-                matchedDocs,
-                knnQuery.getByteQueryVector(),
-                (KNNBinaryVectorValues) vectorValues,
-                spaceType,
-                knnQuery.getParentsFilter().getBitSet(leafReaderContext)
-            );
-        }
-
         if (VectorDataType.BINARY == knnQuery.getVectorDataType()) {
             final KNNVectorValues<byte[]> vectorValues = KNNVectorValuesFactory.getVectorValues(fieldInfo, reader);
-            return new FilteredIdsKNNByteIterator(
+            if (isNestedRequired) {
+                return new NestedByteVectorIdsKNNIterator(
+                    matchedDocs,
+                    knnQuery.getByteQueryVector(),
+                    (KNNBinaryVectorValues) vectorValues,
+                    spaceType,
+                    knnQuery.getParentsFilter().getBitSet(leafReaderContext)
+                );
+            }
+            return new ByteVectorIdsKNNIterator(
                 matchedDocs,
                 knnQuery.getByteQueryVector(),
                 (KNNBinaryVectorValues) vectorValues,
@@ -142,7 +140,7 @@ private KNNIterator getMatchedKNNIterator(LeafReaderContext leafReaderContext, E
 
         final KNNVectorValues<float[]> vectorValues = KNNVectorValuesFactory.getVectorValues(fieldInfo, reader);
         if (isNestedRequired) {
-            return new NestedFilteredIdsKNNIterator(
+            return new NestedVectorIdsKNNIterator(
                 matchedDocs,
                 knnQuery.getQueryVector(),
                 (KNNFloatVectorValues) vectorValues,
@@ -152,8 +150,7 @@ private KNNIterator getMatchedKNNIterator(LeafReaderContext leafReaderContext, E
                 segmentLevelQuantizationInfo
             );
         }
-
-        return new FilteredIdsKNNIterator(
+        return new VectorIdsKNNIterator(
             matchedDocs,
             knnQuery.getQueryVector(),
             (KNNFloatVectorValues) vectorValues,
@@ -180,7 +177,7 @@ public static class ExactSearcherContext {
         KNNQuery knnQuery;
         /**
          * whether the matchedDocs contains parent ids or child ids. This is relevant in the case of
-         * filtered nested search where the matchedDocs contain the parent ids and {@link NestedFilteredIdsKNNIterator}
+         * filtered nested search where the matchedDocs contain the parent ids and {@link NestedVectorIdsKNNIterator}
          * needs to be used.
          */
         boolean isParentHits;

diff --git a/.../filtered/FilteredIdsKNNByteIterator.java → ...y/iterators/ByteVectorIdsKNNIterator.java b/.../filtered/FilteredIdsKNNByteIterator.java → ...y/iterators/ByteVectorIdsKNNIterator.java
@@ -3,11 +3,12 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-package org.opensearch.knn.index.query.filtered;
+package org.opensearch.knn.index.query.iterators;
 
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.BitSet;
 import org.apache.lucene.util.BitSetIterator;
+import org.opensearch.common.Nullable;
 import org.opensearch.knn.index.SpaceType;
 import org.opensearch.knn.index.vectorvalues.KNNBinaryVectorValues;
 
@@ -17,30 +18,34 @@
  * Inspired by DiversifyingChildrenFloatKnnVectorQuery in lucene
  * https://github.com/apache/lucene/blob/7b8aece125aabff2823626d5b939abf4747f63a7/lucene/join/src/java/org/apache/lucene/search/join/DiversifyingChildrenFloatKnnVectorQuery.java#L162
  *
- * The class is used in KNNWeight to score filtered KNN field by iterating filterIdsArray.
+ * The class is used in KNNWeight to score all docs; if a filter is provided, it iterates only over filterIdsArray.
  */
-public class FilteredIdsKNNByteIterator implements KNNIterator {
-    // Array of doc ids to iterate
-    protected final BitSet filterIdsBitSet;
+public class ByteVectorIdsKNNIterator implements KNNIterator {
     protected final BitSetIterator bitSetIterator;
     protected final byte[] queryVector;
     protected final KNNBinaryVectorValues binaryVectorValues;
     protected final SpaceType spaceType;
     protected float currentScore = Float.NEGATIVE_INFINITY;
     protected int docId;
 
-    public FilteredIdsKNNByteIterator(
-        final BitSet filterIdsBitSet,
+    public ByteVectorIdsKNNIterator(
+        @Nullable final BitSet filterIdsBitSet,
         final byte[] queryVector,
         final KNNBinaryVectorValues binaryVectorValues,
         final SpaceType spaceType
-    ) {
-        this.filterIdsBitSet = filterIdsBitSet;
-        this.bitSetIterator = new BitSetIterator(filterIdsBitSet, filterIdsBitSet.length());
+    ) throws IOException {
+        this.bitSetIterator = filterIdsBitSet == null ? null : new BitSetIterator(filterIdsBitSet, filterIdsBitSet.length());
         this.queryVector = queryVector;
         this.binaryVectorValues = binaryVectorValues;
         this.spaceType = spaceType;
-        this.docId = bitSetIterator.nextDoc();
+        // This cannot be moved inside nextDoc() method since it will break when we have nested field, where
+        // nextDoc should already be referring to next knnVectorValues
+        this.docId = getNextDocId();
+    }
+
+    public ByteVectorIdsKNNIterator(final byte[] queryVector, final KNNBinaryVectorValues binaryVectorValues, final SpaceType spaceType)
+        throws IOException {
+        this(null, queryVector, binaryVectorValues, spaceType);
     }
 
     /**
@@ -55,10 +60,10 @@ public int nextDoc() throws IOException {
         if (docId == DocIdSetIterator.NO_MORE_DOCS) {
             return DocIdSetIterator.NO_MORE_DOCS;
         }
-        int doc = binaryVectorValues.advance(docId);
         currentScore = computeScore();
-        docId = bitSetIterator.nextDoc();
-        return doc;
+        int currentDocId = docId;
+        docId = getNextDocId();
+        return currentDocId;
     }
 
     @Override
@@ -72,4 +77,16 @@ protected float computeScore() throws IOException {
         // scores correspond to closer vectors.
         return spaceType.getKnnVectorSimilarityFunction().compare(queryVector, vector);
     }
+
+    protected int getNextDocId() throws IOException {
+        if (bitSetIterator == null) {
+            return binaryVectorValues.nextDoc();
+        }
+        int nextDocID = this.bitSetIterator.nextDoc();
+        // For filter case, advance vector values to corresponding doc id from filter bit set
+        if (nextDocID != DocIdSetIterator.NO_MORE_DOCS) {
+            binaryVectorValues.advance(nextDocID);
+        }
+        return nextDocID;
+    }
 }
diff --git a/...knn/index/query/filtered/KNNIterator.java → ...nn/index/query/iterators/KNNIterator.java b/...knn/index/query/filtered/KNNIterator.java → ...nn/index/query/iterators/KNNIterator.java
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-package org.opensearch.knn.index.query.filtered;
+package org.opensearch.knn.index.query.iterators;
 
 import java.io.IOException;
 

diff --git a/...red/NestedFilteredIdsKNNByteIterator.java → ...ators/NestedByteVectorIdsKNNIterator.java b/...red/NestedFilteredIdsKNNByteIterator.java → ...ators/NestedByteVectorIdsKNNIterator.java
@@ -3,33 +3,45 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-package org.opensearch.knn.index.query.filtered;
+package org.opensearch.knn.index.query.iterators;
 
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.BitSet;
+import org.opensearch.common.Nullable;
 import org.opensearch.knn.index.SpaceType;
 import org.opensearch.knn.index.vectorvalues.KNNBinaryVectorValues;
 
 import java.io.IOException;
 
 /**
- * This iterator iterates filterIdsArray to score. However, it dedupe docs per each parent doc
+ * This iterator iterates filterIdsArray to score if filter is provided else it iterates over all docs.
+ * However, it dedupe docs per each parent doc
  * of which ID is set in parentBitSet and only return best child doc with the highest score.
  */
-public class NestedFilteredIdsKNNByteIterator extends FilteredIdsKNNByteIterator {
+public class NestedByteVectorIdsKNNIterator extends ByteVectorIdsKNNIterator {
     private final BitSet parentBitSet;
 
-    public NestedFilteredIdsKNNByteIterator(
-        final BitSet filterIdsArray,
+    public NestedByteVectorIdsKNNIterator(
+        @Nullable final BitSet filterIdsArray,
         final byte[] queryVector,
         final KNNBinaryVectorValues binaryVectorValues,
         final SpaceType spaceType,
         final BitSet parentBitSet
-    ) {
+    ) throws IOException {
         super(filterIdsArray, queryVector, binaryVectorValues, spaceType);
         this.parentBitSet = parentBitSet;
     }
 
+    public NestedByteVectorIdsKNNIterator(
+        final byte[] queryVector,
+        final KNNBinaryVectorValues binaryVectorValues,
+        final SpaceType spaceType,
+        final BitSet parentBitSet
+    ) throws IOException {
+        super(null, queryVector, binaryVectorValues, spaceType);
+        this.parentBitSet = parentBitSet;
+    }
+
     /**
      * Advance to the next best child doc per parent and update score with the best score among child docs from the parent.
      * DocIdSetIterator.NO_MORE_DOCS is returned when there is no more docs
@@ -46,14 +58,18 @@ public int nextDoc() throws IOException {
         int currentParent = parentBitSet.nextSetBit(docId);
         int bestChild = -1;
 
+        // In order to traverse all children for given parent, we have to use docId < parentId, because,
+        // kNNVectorValues will not have parent id since DocId is unique per segment. For ex: let's say for doc id 1, there is one child
+        // and for doc id 5, there are three children. In that case knnVectorValues iterator will have [0, 2, 3, 4]
+        // and parentBitSet will have [1,5]
+        // Hence, we have to iterate till docId from knnVectorValues is less than parentId instead of till equal to parentId
         while (docId != DocIdSetIterator.NO_MORE_DOCS && docId < currentParent) {
-            binaryVectorValues.advance(docId);
             float score = computeScore();
             if (score > currentScore) {
                 bestChild = docId;
                 currentScore = score;
             }
-            docId = bitSetIterator.nextDoc();
+            docId = getNextDocId();
         }
 
         return bestChild;

diff --git a/...iltered/NestedFilteredIdsKNNIterator.java → ...iterators/NestedVectorIdsKNNIterator.java b/...iltered/NestedFilteredIdsKNNIterator.java → ...iterators/NestedVectorIdsKNNIterator.java
@@ -3,42 +3,53 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-package org.opensearch.knn.index.query.filtered;
+package org.opensearch.knn.index.query.iterators;
 
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.BitSet;
+import org.opensearch.common.Nullable;
 import org.opensearch.knn.index.SpaceType;
 import org.opensearch.knn.index.query.SegmentLevelQuantizationInfo;
 import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues;
 
 import java.io.IOException;
 
 /**
- * This iterator iterates filterIdsArray to score. However, it dedupe docs per each parent doc
+ * This iterator iterates filterIdsArray to score if filter is provided else it iterates over all docs.
+ * However, it dedupe docs per each parent doc
  * of which ID is set in parentBitSet and only return best child doc with the highest score.
  */
-public class NestedFilteredIdsKNNIterator extends FilteredIdsKNNIterator {
+public class NestedVectorIdsKNNIterator extends VectorIdsKNNIterator {
     private final BitSet parentBitSet;
 
-    NestedFilteredIdsKNNIterator(
-        final BitSet filterIdsArray,
+    public NestedVectorIdsKNNIterator(
+        @Nullable final BitSet filterIdsArray,
         final float[] queryVector,
         final KNNFloatVectorValues knnFloatVectorValues,
         final SpaceType spaceType,
         final BitSet parentBitSet
-    ) {
+    ) throws IOException {
         this(filterIdsArray, queryVector, knnFloatVectorValues, spaceType, parentBitSet, null, null);
     }
 
-    public NestedFilteredIdsKNNIterator(
-        final BitSet filterIdsArray,
+    public NestedVectorIdsKNNIterator(
+        final float[] queryVector,
+        final KNNFloatVectorValues knnFloatVectorValues,
+        final SpaceType spaceType,
+        final BitSet parentBitSet
+    ) throws IOException {
+        this(null, queryVector, knnFloatVectorValues, spaceType, parentBitSet, null, null);
+    }
+
+    public NestedVectorIdsKNNIterator(
+        @Nullable final BitSet filterIdsArray,
         final float[] queryVector,
         final KNNFloatVectorValues knnFloatVectorValues,
         final SpaceType spaceType,
         final BitSet parentBitSet,
         final byte[] quantizedVector,
         final SegmentLevelQuantizationInfo segmentLevelQuantizationInfo
-    ) {
+    ) throws IOException {
         super(filterIdsArray, queryVector, knnFloatVectorValues, spaceType, quantizedVector, segmentLevelQuantizationInfo);
         this.parentBitSet = parentBitSet;
     }
@@ -59,14 +70,18 @@ public int nextDoc() throws IOException {
         int currentParent = parentBitSet.nextSetBit(docId);
         int bestChild = -1;
 
+        // In order to traverse all children for given parent, we have to use docId < parentId, because,
+        // kNNVectorValues will not have parent id since DocId is unique per segment. For ex: let's say for doc id 1, there is one child
+        // and for doc id 5, there are three children. In that case knnVectorValues iterator will have [0, 2, 3, 4]
+        // and parentBitSet will have [1,5]
+        // Hence, we have to iterate till docId from knnVectorValues is less than parentId instead of till equal to parentId
         while (docId != DocIdSetIterator.NO_MORE_DOCS && docId < currentParent) {
-            knnFloatVectorValues.advance(docId);
             float score = computeScore();
             if (score > currentScore) {
                 bestChild = docId;
                 currentScore = score;
             }
-            docId = bitSetIterator.nextDoc();
+            docId = getNextDocId();
         }
 
         return bestChild;