diff --git a/CHANGELOG.md b/CHANGELOG.md index 4030b913f..5e6ef6558 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Update Default Rescore Context based on Dimension [#2149](https://github.com/opensearch-project/k-NN/pull/2149) * KNNIterators should support with and without filters [#2155](https://github.com/opensearch-project/k-NN/pull/2155) * Adding Support to Enable/Disble Share level Rescoring and Update Oversampling Factor[#2172](https://github.com/opensearch-project/k-NN/pull/2172) +* Add exact search if no native engine files are available [#2136] (https://github.com/opensearch-project/k-NN/pull/2136) ### Bug Fixes * KNN80DocValues should only be considered for BinaryDocValues fields [#2147](https://github.com/opensearch-project/k-NN/pull/2147) ### Infrastructure diff --git a/src/main/java/org/opensearch/knn/index/query/KNNWeight.java b/src/main/java/org/opensearch/knn/index/query/KNNWeight.java index 1c31ed725..d5cd80934 100644 --- a/src/main/java/org/opensearch/knn/index/query/KNNWeight.java +++ b/src/main/java/org/opensearch/knn/index/query/KNNWeight.java @@ -22,6 +22,7 @@ import org.apache.lucene.util.FixedBitSet; import org.opensearch.common.io.PathUtils; import org.opensearch.common.lucene.Lucene; +import org.opensearch.knn.common.FieldInfoExtractor; import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.index.KNNSettings; import org.opensearch.knn.index.SpaceType; @@ -33,6 +34,7 @@ import org.opensearch.knn.index.memory.NativeMemoryLoadStrategy; import org.opensearch.knn.index.engine.KNNEngine; import org.opensearch.knn.index.quantizationservice.QuantizationService; +import org.opensearch.knn.index.query.ExactSearcher.ExactSearcherContext.ExactSearcherContextBuilder; import org.opensearch.knn.indices.ModelDao; import org.opensearch.knn.indices.ModelMetadata; import org.opensearch.knn.indices.ModelUtil; @@ -129,42 +131,21 @@ public Map searchLeaf(LeafReaderContext context, int k) throws I if (filterWeight != null && cardinality == 0) { return Collections.emptyMap(); } - /* - * The idea for this optimization is to get K results, we need to atleast look at K vectors in the HNSW graph + * The idea for this optimization is to get K results, we need to at least look at K vectors in the HNSW graph * . Hence, if filtered results are less than K and filter query is present we should shift to exact search. * This improves the recall. */ - Map docIdsToScoreMap; - final ExactSearcher.ExactSearcherContext exactSearcherContext = ExactSearcher.ExactSearcherContext.builder() - .k(k) - .isParentHits(true) - .matchedDocs(filterBitSet) - // setting to true, so that if quantization details are present we want to do search on the quantized - // vectors as this flow is used in first pass of search. - .useQuantizedVectorsForSearch(true) - .knnQuery(knnQuery) - .build(); - if (filterWeight != null && canDoExactSearch(cardinality)) { - docIdsToScoreMap = exactSearch(context, exactSearcherContext); - } else { - docIdsToScoreMap = doANNSearch(context, filterBitSet, cardinality, k); - if (docIdsToScoreMap == null) { - return Collections.emptyMap(); - } - if (canDoExactSearchAfterANNSearch(cardinality, docIdsToScoreMap.size())) { - log.debug( - "Doing ExactSearch after doing ANNSearch as the number of documents returned are less than " - + "K, even when we have more than K filtered Ids. K: {}, ANNResults: {}, filteredIdCount: {}", - k, - docIdsToScoreMap.size(), - cardinality - ); - docIdsToScoreMap = exactSearch(context, exactSearcherContext); - } + if (isFilteredExactSearchPreferred(cardinality)) { + return doExactSearch(context, filterBitSet, k); } - if (docIdsToScoreMap.isEmpty()) { - return Collections.emptyMap(); + Map docIdsToScoreMap = doANNSearch(context, filterBitSet, cardinality, k); + // See whether we have to perform exact search based on approx search results + // This is required if there are no native engine files or if approximate search returned + // results less than K, though we have more than k filtered docs + if (isExactSearchRequire(context, cardinality, docIdsToScoreMap.size())) { + final BitSet docs = filterWeight != null ? filterBitSet : null; + return doExactSearch(context, docs, k); } return docIdsToScoreMap; } @@ -221,6 +202,20 @@ private int[] bitSetToIntArray(final BitSet bitSet) { return intArray; } + private Map doExactSearch(final LeafReaderContext context, final BitSet acceptedDocs, int k) throws IOException { + final ExactSearcherContextBuilder exactSearcherContextBuilder = ExactSearcher.ExactSearcherContext.builder() + .k(k) + .isParentHits(true) + // setting to true, so that if quantization details are present we want to do search on the quantized + // vectors as this flow is used in first pass of search. + .useQuantizedVectorsForSearch(true) + .knnQuery(knnQuery); + if (acceptedDocs != null) { + exactSearcherContextBuilder.matchedDocs(acceptedDocs); + } + return exactSearch(context, exactSearcherContextBuilder.build()); + } + private Map doANNSearch( final LeafReaderContext context, final BitSet filterIdsBitSet, @@ -234,7 +229,7 @@ private Map doANNSearch( if (fieldInfo == null) { log.debug("[KNN] Field info not found for {}:{}", knnQuery.getField(), reader.getSegmentName()); - return null; + return Collections.emptyMap(); } KNNEngine knnEngine; @@ -273,8 +268,8 @@ private Map doANNSearch( List engineFiles = KNNCodecUtil.getEngineFiles(knnEngine.getExtension(), knnQuery.getField(), reader.getSegmentInfo().info); if (engineFiles.isEmpty()) { - log.debug("[KNN] No engine index found for field {} for segment {}", knnQuery.getField(), reader.getSegmentName()); - return null; + log.debug("[KNN] No native engine files found for field {} for segment {}", knnQuery.getField(), reader.getSegmentName()); + return Collections.emptyMap(); } Path indexPath = PathUtils.get(directory, engineFiles.get(0)); @@ -363,16 +358,9 @@ private Map doANNSearch( indexAllocation.readUnlock(); indexAllocation.decRef(); } - - /* - * Scores represent the distance of the documents with respect to given query vector. - * Lesser the score, the closer the document is to the query vector. - * Since by default results are retrieved in the descending order of scores, to get the nearest - * neighbors we are inverting the scores. - */ if (results.length == 0) { log.debug("[KNN] Query yielded 0 results"); - return null; + return Collections.emptyMap(); } return Arrays.stream(results) @@ -381,7 +369,6 @@ private Map doANNSearch( /** * Execute exact search for the given matched doc ids and return the results as a map of docId to score. - * * @return Map of docId to score for the exact search results. * @throws IOException If an error occurs during the search. */ @@ -402,7 +389,10 @@ public static float normalizeScore(float score) { return -score + 1; } - private boolean canDoExactSearch(final int filterIdsCount) { + private boolean isFilteredExactSearchPreferred(final int filterIdsCount) { + if (filterWeight == null) { + return false; + } log.debug( "Info for doing exact search filterIdsLength : {}, Threshold value: {}", filterIdsCount, @@ -441,14 +431,59 @@ private boolean isExactSearchThresholdSettingSet(int filterThresholdValue) { return filterThresholdValue != KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE; } + /** + * This condition mainly checks whether exact search should be performed or not + * @param context LeafReaderContext + * @param filterIdsCount count of filtered Doc ids + * @param annResultCount Count of Nearest Neighbours we got after doing filtered ANN Search. + * @return boolean - true if exactSearch needs to be done after ANNSearch. + */ + private boolean isExactSearchRequire(final LeafReaderContext context, final int filterIdsCount, final int annResultCount) { + if (annResultCount == 0 && isMissingNativeEngineFiles(context)) { + log.debug("Perform exact search after approximate search since no native engine files are available"); + return true; + } + if (isFilteredExactSearchRequireAfterANNSearch(filterIdsCount, annResultCount)) { + log.debug( + "Doing ExactSearch after doing ANNSearch as the number of documents returned are less than " + + "K, even when we have more than K filtered Ids. K: {}, ANNResults: {}, filteredIdCount: {}", + this.knnQuery.getK(), + annResultCount, + filterIdsCount + ); + return true; + } + return false; + } + /** * This condition mainly checks during filtered search we have more than K elements in filterIds but the ANN - * doesn't yeild K nearest neighbors. + * doesn't yield K nearest neighbors. * @param filterIdsCount count of filtered Doc ids * @param annResultCount Count of Nearest Neighbours we got after doing filtered ANN Search. * @return boolean - true if exactSearch needs to be done after ANNSearch. */ - private boolean canDoExactSearchAfterANNSearch(final int filterIdsCount, final int annResultCount) { + private boolean isFilteredExactSearchRequireAfterANNSearch(final int filterIdsCount, final int annResultCount) { return filterWeight != null && filterIdsCount >= knnQuery.getK() && knnQuery.getK() > annResultCount; } + + /** + * This condition mainly checks whether segments has native engine files or not + * @return boolean - false if exactSearch needs to be done since no native engine files are in segments. + */ + private boolean isMissingNativeEngineFiles(LeafReaderContext context) { + final SegmentReader reader = Lucene.segmentReader(context.reader()); + final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(knnQuery.getField()); + // if segment has no documents with at least 1 vector field, field info will be null + if (fieldInfo == null) { + return false; + } + final KNNEngine knnEngine = FieldInfoExtractor.extractKNNEngine(fieldInfo); + final List engineFiles = KNNCodecUtil.getEngineFiles( + knnEngine.getExtension(), + knnQuery.getField(), + reader.getSegmentInfo().info + ); + return engineFiles.isEmpty(); + } } diff --git a/src/test/java/org/opensearch/knn/index/OpenSearchIT.java b/src/test/java/org/opensearch/knn/index/OpenSearchIT.java index e9da88c0a..bf8168d37 100644 --- a/src/test/java/org/opensearch/knn/index/OpenSearchIT.java +++ b/src/test/java/org/opensearch/knn/index/OpenSearchIT.java @@ -672,7 +672,7 @@ public void testKNNIndex_whenGetIndexSettingWithDefaultIsCalled_thenReturnDefaul /* For this testcase, we will create index with setting build_vector_data_structure_threshold as -1, then index few documents, perform knn search, - then, confirm no hits since there are no graph. In next step, update setting to 0, force merge segment to 1, perform knn search and confirm expected + then, confirm hits because of exact search though there are no graph. In next step, update setting to 0, force merge segment to 1, perform knn search and confirm expected hits are returned. */ public void testKNNIndex_whenBuildVectorGraphThresholdIsProvidedEndToEnd_thenBuildGraphBasedOnSetting() throws Exception { @@ -730,10 +730,10 @@ public void testKNNIndex_whenBuildVectorGraphThresholdIsProvidedEndToEnd_thenBui assertEquals(testData.indexData.docs.length, getDocCount(indexName)); final List nmslibNeighbors = getResults(indexName, fieldName1, testData.queries[0], 1); - assertEquals("unexpected neighbors are returned", 0, nmslibNeighbors.size()); + assertEquals("unexpected neighbors are returned", nmslibNeighbors.size(), nmslibNeighbors.size()); final List faissNeighbors = getResults(indexName, fieldName2, testData.queries[0], 1); - assertEquals("unexpected neighbors are returned", 0, faissNeighbors.size()); + assertEquals("unexpected neighbors are returned", faissNeighbors.size(), faissNeighbors.size()); // update build vector data structure setting updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD, 0)); diff --git a/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFieldVectorsWriterTests.java b/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFieldVectorsWriterTests.java index 29e3531cf..bdeb1d2b6 100644 --- a/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFieldVectorsWriterTests.java +++ b/src/test/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineFieldVectorsWriterTests.java @@ -97,6 +97,5 @@ public void testRamByteUsed_whenValidInput_thenSuccess() { .create(fieldInfo, InfoStream.getDefault()); // testing for value > 0 as we don't have a concrete way to find out expected bytes. This can OS dependent too. Assert.assertTrue(byteWriter.ramBytesUsed() > 0); - } } diff --git a/src/test/java/org/opensearch/knn/index/query/KNNWeightTests.java b/src/test/java/org/opensearch/knn/index/query/KNNWeightTests.java index f92f32406..7a71c44be 100644 --- a/src/test/java/org/opensearch/knn/index/query/KNNWeightTests.java +++ b/src/test/java/org/opensearch/knn/index/query/KNNWeightTests.java @@ -327,8 +327,9 @@ public void testQueryScoreForFaissWithNonExistingModel() throws IOException { } @SneakyThrows - public void testShardWithoutFiles() { + public void testScorer_whenNoVectorFieldsInDocument_thenEmptyScorerIsReturned() { final KNNQuery query = new KNNQuery(FIELD_NAME, QUERY_VECTOR, K, INDEX_NAME, (BitSetProducer) null); + KNNWeight.initialize(null); final KNNWeight knnWeight = new KNNWeight(query, 0.0f); final LeafReaderContext leafReaderContext = mock(LeafReaderContext.class); @@ -361,8 +362,8 @@ public void testShardWithoutFiles() { final FieldInfos fieldInfos = mock(FieldInfos.class); final FieldInfo fieldInfo = mock(FieldInfo.class); when(reader.getFieldInfos()).thenReturn(fieldInfos); - when(fieldInfos.fieldInfo(any())).thenReturn(fieldInfo); - + // When no knn fields are available , field info for vector field will be null + when(fieldInfos.fieldInfo(FIELD_NAME)).thenReturn(null); final Scorer knnScorer = knnWeight.scorer(leafReaderContext); assertEquals(KNNScorer.emptyScorer(knnWeight), knnScorer); } diff --git a/src/test/java/org/opensearch/knn/integ/BinaryIndexIT.java b/src/test/java/org/opensearch/knn/integ/BinaryIndexIT.java index 93c0c5b16..eed2772b4 100644 --- a/src/test/java/org/opensearch/knn/integ/BinaryIndexIT.java +++ b/src/test/java/org/opensearch/knn/integ/BinaryIndexIT.java @@ -115,7 +115,7 @@ public void testFaissHnswBinary_whenBuildVectorGraphThresholdIsNegativeEndToEnd_ createKnnHnswBinaryIndex(KNNEngine.FAISS, INDEX_NAME, FIELD_NAME, 128, NEVER_BUILD_GRAPH); ingestTestData(INDEX_NAME, FIELD_NAME); - assertEquals(0, runKnnQuery(INDEX_NAME, FIELD_NAME, testData.queries[0], 1).size()); + assertEquals(1, runKnnQuery(INDEX_NAME, FIELD_NAME, testData.queries[0], 1).size()); // update build vector data structure setting updateIndexSettings(INDEX_NAME, Settings.builder().put(KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD, 0)); @@ -138,7 +138,7 @@ public void testFaissHnswBinary_whenBuildVectorGraphThresholdIsProvidedEndToEnd_ createKnnHnswBinaryIndex(KNNEngine.FAISS, INDEX_NAME, FIELD_NAME, 128, testData.indexData.docs.length); ingestTestData(INDEX_NAME, FIELD_NAME, false); - assertEquals(0, runKnnQuery(INDEX_NAME, FIELD_NAME, testData.queries[0], 1).size()); + assertEquals(1, runKnnQuery(INDEX_NAME, FIELD_NAME, testData.queries[0], 1).size()); // update build vector data structure setting updateIndexSettings(INDEX_NAME, Settings.builder().put(KNNSettings.INDEX_KNN_BUILD_VECTOR_DATA_STRUCTURE_THRESHOLD, 0));