Skip to content

Commit 87e97aa

Browse files
MB-66396: IndexSnapshot API to retrieve highest cardinality centroids
1 parent be98005 commit 87e97aa

File tree

4 files changed

+181
-9
lines changed

4 files changed

+181
-9
lines changed

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ require (
99
github.com/bits-and-blooms/bitset v1.22.0
1010
github.com/blevesearch/bleve_index_api v1.2.8
1111
github.com/blevesearch/geo v0.2.4
12-
github.com/blevesearch/go-faiss v1.0.25
12+
github.com/blevesearch/go-faiss v1.0.26-0.20250808221324-a1d3051837ea
1313
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
1414
github.com/blevesearch/go-porterstemmer v1.0.3
1515
github.com/blevesearch/goleveldb v1.0.1
1616
github.com/blevesearch/gtreap v0.1.1
17-
github.com/blevesearch/scorch_segment_api/v2 v2.3.10
17+
github.com/blevesearch/scorch_segment_api/v2 v2.3.11-0.20250812224933-05c6687cb15b
1818
github.com/blevesearch/segment v0.9.1
1919
github.com/blevesearch/snowball v0.6.1
2020
github.com/blevesearch/snowballstem v0.9.0
@@ -26,7 +26,7 @@ require (
2626
github.com/blevesearch/zapx/v13 v13.4.2
2727
github.com/blevesearch/zapx/v14 v14.4.2
2828
github.com/blevesearch/zapx/v15 v15.4.2
29-
github.com/blevesearch/zapx/v16 v16.2.4
29+
github.com/blevesearch/zapx/v16 v16.2.5-0.20250812225830-3e3f5b307941
3030
github.com/couchbase/moss v0.2.0
3131
github.com/spf13/cobra v1.8.1
3232
go.etcd.io/bbolt v1.4.0

go.sum

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ github.com/blevesearch/bleve_index_api v1.2.8 h1:Y98Pu5/MdlkRyLM0qDHostYo7i+Vv1c
77
github.com/blevesearch/bleve_index_api v1.2.8/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
88
github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk=
99
github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8=
10-
github.com/blevesearch/go-faiss v1.0.25 h1:lel1rkOUGbT1CJ0YgzKwC7k+XH0XVBHnCVWahdCXk4U=
11-
github.com/blevesearch/go-faiss v1.0.25/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
10+
github.com/blevesearch/go-faiss v1.0.26-0.20250808221324-a1d3051837ea h1:xiz5c7fyBrgmt/TVWRM+8LVNFQS7hDjlyfAjliumDWM=
11+
github.com/blevesearch/go-faiss v1.0.26-0.20250808221324-a1d3051837ea/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
1212
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA=
1313
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A=
1414
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
@@ -20,8 +20,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
2020
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
2121
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
2222
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
23-
github.com/blevesearch/scorch_segment_api/v2 v2.3.10 h1:Yqk0XD1mE0fDZAJXTjawJ8If/85JxnLd8v5vG/jWE/s=
24-
github.com/blevesearch/scorch_segment_api/v2 v2.3.10/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8=
23+
github.com/blevesearch/scorch_segment_api/v2 v2.3.11-0.20250812224933-05c6687cb15b h1:dj5AYwlxpsqRXTmnO4nqUBiO5RvBdPdN5tVPJscZpTs=
24+
github.com/blevesearch/scorch_segment_api/v2 v2.3.11-0.20250812224933-05c6687cb15b/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8=
2525
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
2626
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
2727
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
@@ -44,8 +44,8 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT
4444
github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8=
4545
github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k=
4646
github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
47-
github.com/blevesearch/zapx/v16 v16.2.4 h1:tGgfvleXTAkwsD5mEzgM3zCS/7pgocTCnO1oyAUjlww=
48-
github.com/blevesearch/zapx/v16 v16.2.4/go.mod h1:Rti/REtuuMmzwsI8/C/qIzRaEoSK/wiFYw5e5ctUKKs=
47+
github.com/blevesearch/zapx/v16 v16.2.5-0.20250812225830-3e3f5b307941 h1:U1IBoeRBOtwrmyt3PrYJMTIZdsecdpS1epFfKdycqww=
48+
github.com/blevesearch/zapx/v16 v16.2.5-0.20250812225830-3e3f5b307941/go.mod h1:VWpjVlGQxtXK5dUEGqum0RhloAiHNYQYWKzrP+zFi68=
4949
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
5050
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
5151
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=

index/scorch/scorch_knn_test.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Copyright (c) 2025 Couchbase, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
//go:build vectors
16+
// +build vectors
17+
18+
package scorch
19+
20+
import (
21+
"fmt"
22+
"log"
23+
"math/rand"
24+
"testing"
25+
"time"
26+
27+
"github.com/blevesearch/bleve/v2/document"
28+
"github.com/blevesearch/bleve/v2/mapping"
29+
index "github.com/blevesearch/bleve_index_api"
30+
)
31+
32+
func TestIndexSnapshotHighestCardinalityCentroids(t *testing.T) {
33+
cfg := CreateConfig("TestIndexSnapshotHighestCardinalityCentroids")
34+
err := InitTest(cfg)
35+
if err != nil {
36+
t.Fatal(err)
37+
}
38+
defer func() {
39+
err := DestroyTest(cfg)
40+
if err != nil {
41+
t.Log(err)
42+
}
43+
}()
44+
45+
testConfig := cfg
46+
mp := mapping.NewIndexMapping()
47+
48+
vectorDims := 5
49+
50+
vecMapping := mapping.NewVectorFieldMapping()
51+
vecMapping.Dims = vectorDims
52+
vecMapping.Similarity = index.CosineSimilarity
53+
54+
docMapping := mapping.NewDocumentStaticMapping()
55+
docMapping.AddFieldMappingsAt("vec", vecMapping)
56+
mp.DefaultMapping = docMapping
57+
58+
analysisQueue := index.NewAnalysisQueue(1)
59+
idx, err := NewScorch("storeName", testConfig, analysisQueue)
60+
if err != nil {
61+
log.Fatalln(err)
62+
}
63+
err = idx.Open()
64+
if err != nil {
65+
t.Errorf("error opening index: %v", err)
66+
}
67+
defer func() {
68+
err := idx.Close()
69+
if err != nil {
70+
t.Fatal(err)
71+
}
72+
}()
73+
74+
rand.Seed(time.Now().UnixNano())
75+
min, max := float32(-10.0), float32(10.0)
76+
genRandomVector := func() []float32 {
77+
vec := make([]float32, vectorDims)
78+
for i := range vec {
79+
vec[i] = min + rand.Float32()*(max-min)
80+
}
81+
return vec
82+
}
83+
84+
var batch *index.Batch
85+
for i := 1; i <= 20000; i++ {
86+
doc := document.NewDocument(fmt.Sprintf("doc-%d", i))
87+
err = mp.MapDocument(doc, map[string]interface{}{
88+
"vec": genRandomVector(),
89+
})
90+
if err != nil {
91+
t.Errorf("error mapping doc: %v", err)
92+
}
93+
if batch == nil {
94+
batch = index.NewBatch()
95+
}
96+
batch.Update(doc)
97+
98+
if i%200 == 0 {
99+
err = idx.Batch(batch)
100+
if err != nil {
101+
t.Errorf("Error adding batch to index: %v", err)
102+
}
103+
batch = nil
104+
}
105+
}
106+
107+
if batch != nil {
108+
// In case doc count is not a multiple of 200, we need to add the final batch
109+
err = idx.Batch(batch)
110+
if err != nil {
111+
t.Errorf("Error adding final batch to index: %v", err)
112+
}
113+
}
114+
115+
reader, err := idx.Reader()
116+
if err != nil {
117+
t.Fatal(err)
118+
}
119+
defer func() {
120+
err := reader.Close()
121+
if err != nil {
122+
t.Fatal(err)
123+
}
124+
}()
125+
126+
limit := 5
127+
if snap, ok := reader.(*IndexSnapshot); ok {
128+
centroids, err := snap.HighestCardinalityCentroids("vec", limit)
129+
if err != nil {
130+
t.Fatal(err)
131+
}
132+
fmt.Println(centroids)
133+
}
134+
}

index/scorch/snapshot_index_vr.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"encoding/json"
2424
"fmt"
2525
"reflect"
26+
"sort"
2627

2728
"github.com/blevesearch/bleve/v2/size"
2829
index "github.com/blevesearch/bleve_index_api"
@@ -163,3 +164,40 @@ func (i *IndexSnapshotVectorReader) Close() error {
163164
// TODO Consider if any scope of recycling here.
164165
return nil
165166
}
167+
168+
func (i *IndexSnapshot) HighestCardinalityCentroids(field string, limit int) (
169+
centroids []segment_api.CentroidCardinality, err error) {
170+
if len(i.segment) == 0 {
171+
return nil, fmt.Errorf("no segments available")
172+
}
173+
174+
if limit <= 0 {
175+
return nil, fmt.Errorf("limit must be positive")
176+
}
177+
178+
rvCentroids := make([]segment_api.CentroidCardinality, limit)
179+
180+
for _, segment := range i.segment {
181+
if sv, ok := segment.segment.(segment_api.VectorSegment); ok {
182+
vecIndex, err := sv.InterpretVectorIndex(field,
183+
false /* does not require filtering */, segment.deleted)
184+
if err != nil {
185+
return nil, fmt.Errorf("failed to interpret vector index for field %s in segment: %v", field, err)
186+
}
187+
188+
centroidCardinalities, err := vecIndex.ObtainTopKCentroidCardinalitiesFromIVFIndex(limit)
189+
if err != nil {
190+
return nil, fmt.Errorf("failed to obtain top k centroid cardinalities for field %s in segment: %v", field, err)
191+
}
192+
193+
centroidCardinalities = append(centroidCardinalities, rvCentroids...)
194+
sort.Slice(centroidCardinalities, func(i, j int) bool {
195+
return centroidCardinalities[i].Cardinality > centroidCardinalities[j].Cardinality
196+
})
197+
198+
rvCentroids = centroidCardinalities[:limit]
199+
}
200+
}
201+
202+
return rvCentroids, nil
203+
}

0 commit comments

Comments
 (0)