Skip to content

Commit be98005

Browse files
MB-66396: IndexSnapshot API to retrieve highest frequency terms
1 parent 3ef8f85 commit be98005

File tree

2 files changed

+200
-0
lines changed

2 files changed

+200
-0
lines changed

index/scorch/scorch_test.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2810,3 +2810,117 @@ func TestPersistorMergerOptions(t *testing.T) {
28102810
}
28112811
}
28122812
}
2813+
2814+
func TestIndexSnapshotHighestFrequencyTerms(t *testing.T) {
2815+
cfg := CreateConfig("TestIndexSnapshotHighestFrequencyTerms")
2816+
err := InitTest(cfg)
2817+
if err != nil {
2818+
t.Fatal(err)
2819+
}
2820+
defer func() {
2821+
err := DestroyTest(cfg)
2822+
if err != nil {
2823+
t.Log(err)
2824+
}
2825+
}()
2826+
2827+
testConfig := cfg
2828+
mp := mapping.NewIndexMapping()
2829+
2830+
textMapping := mapping.NewTextFieldMapping()
2831+
textMapping.Analyzer = "en"
2832+
2833+
docMapping := mapping.NewDocumentStaticMapping()
2834+
docMapping.AddFieldMappingsAt("text", textMapping)
2835+
2836+
mp.DefaultMapping = docMapping
2837+
2838+
analysisQueue := index.NewAnalysisQueue(1)
2839+
idx, err := NewScorch("storeName", testConfig, analysisQueue)
2840+
if err != nil {
2841+
log.Fatalln(err)
2842+
}
2843+
err = idx.Open()
2844+
if err != nil {
2845+
t.Errorf("error opening index: %v", err)
2846+
}
2847+
defer func() {
2848+
err := idx.Close()
2849+
if err != nil {
2850+
t.Fatal(err)
2851+
}
2852+
}()
2853+
2854+
data := []map[string]string{
2855+
{
2856+
"id": "one",
2857+
"text": "She sells sea shells by the sea shore",
2858+
},
2859+
{
2860+
"id": "two",
2861+
"text": "The quick brown fox jumps over the lazy dog",
2862+
},
2863+
{
2864+
"id": "three",
2865+
"text": "She sold sea shells to the person with the dog",
2866+
},
2867+
{
2868+
"id": "four",
2869+
"text": "But there are a lot of dogs on the beach",
2870+
},
2871+
{
2872+
"id": "five",
2873+
"text": "To hell with the foxes",
2874+
},
2875+
{
2876+
"id": "six",
2877+
"text": "What about the dogs",
2878+
},
2879+
{
2880+
"id": "seven",
2881+
"text": "Dogs are OK, foxes are not",
2882+
},
2883+
}
2884+
2885+
for _, d := range data {
2886+
doc := document.NewDocument(d["id"])
2887+
err = mp.MapDocument(doc, d)
2888+
if err != nil {
2889+
t.Errorf("error mapping doc: %v", err)
2890+
}
2891+
2892+
err = idx.Update(doc)
2893+
if err != nil {
2894+
t.Errorf("Error updating index: %v", err)
2895+
}
2896+
}
2897+
2898+
reader, err := idx.Reader()
2899+
if err != nil {
2900+
t.Fatal(err)
2901+
}
2902+
defer func() {
2903+
err := reader.Close()
2904+
if err != nil {
2905+
t.Fatal(err)
2906+
}
2907+
}()
2908+
2909+
limit := 5
2910+
2911+
// FIXME: Type assert to IndexSnapshot for now
2912+
if snap, ok := reader.(*IndexSnapshot); ok {
2913+
termFreqs, err := snap.HighestFrequencyTerms("text", limit)
2914+
if err != nil {
2915+
t.Fatal(err)
2916+
}
2917+
2918+
if len(termFreqs) > limit {
2919+
t.Fatal("terms freqs have more than limit")
2920+
}
2921+
2922+
for i := range termFreqs {
2923+
fmt.Println(termFreqs[i].Term, termFreqs[i].Frequency)
2924+
}
2925+
}
2926+
}

index/scorch/snapshot_index.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"path/filepath"
2424
"reflect"
2525
"sort"
26+
"strings"
2627
"sync"
2728
"sync/atomic"
2829

@@ -1161,3 +1162,88 @@ func (is *IndexSnapshot) ThesaurusKeysRegexp(name string,
11611162
func (is *IndexSnapshot) UpdateSynonymSearchCount(delta uint64) {
11621163
atomic.AddUint64(&is.parent.stats.TotSynonymSearches, delta)
11631164
}
1165+
1166+
type TermFreq struct {
1167+
Term string `json:"term"`
1168+
Frequency uint64 `json:"frequency"`
1169+
}
1170+
1171+
// HighestFrequencyTerms returns the top N terms with the highest frequencies
1172+
// for a given field across all segments in the index snapshot.
1173+
// Returns a slice of term-frequency pairs sorted by frequency (descending).
1174+
func (is *IndexSnapshot) HighestFrequencyTerms(field string, limit int) (
1175+
termsFreqs []TermFreq, err error) {
1176+
if len(is.segment) == 0 {
1177+
return nil, fmt.Errorf("no segments available")
1178+
}
1179+
1180+
if limit <= 0 {
1181+
return nil, fmt.Errorf("limit must be positive")
1182+
}
1183+
1184+
// Use a map to aggregate frequencies across segments
1185+
termFreqs := make(map[string]uint64)
1186+
1187+
// Process each segment to collect term frequencies
1188+
for _, segment := range is.segment {
1189+
dict, err := segment.segment.Dictionary(field)
1190+
if err != nil {
1191+
return nil, fmt.Errorf("failed to get dictionary for field %s in segment: %v", field, err)
1192+
}
1193+
1194+
// Get iterator for all terms in this segment
1195+
itr := dict.AutomatonIterator(nil, nil, nil)
1196+
if itr == nil {
1197+
continue
1198+
}
1199+
1200+
// Iterate through all terms in this segment
1201+
for {
1202+
next, err := itr.Next()
1203+
if err != nil {
1204+
return nil, fmt.Errorf("error iterating dictionary: %v", err)
1205+
}
1206+
if next == nil {
1207+
break // End of terms
1208+
}
1209+
1210+
// Get postings list for this term
1211+
pl, err := dict.PostingsList([]byte(next.Term), segment.deleted, nil)
1212+
if err != nil {
1213+
continue // Skip this term if we can't get postings
1214+
}
1215+
1216+
// Aggregate frequency across segments
1217+
termStr := string(next.Term)
1218+
termFreqs[termStr] += pl.Count()
1219+
}
1220+
}
1221+
1222+
if len(termFreqs) == 0 {
1223+
return nil, fmt.Errorf("no terms found for field %s", field)
1224+
}
1225+
1226+
var termFreqList []TermFreq
1227+
for termStr, freq := range termFreqs {
1228+
termFreqList = append(termFreqList, TermFreq{
1229+
Term: termStr,
1230+
Frequency: freq,
1231+
})
1232+
}
1233+
1234+
// Sort by frequency (descending)
1235+
sort.Slice(termFreqList, func(i, j int) bool {
1236+
if termFreqList[i].Frequency == termFreqList[j].Frequency {
1237+
// If frequencies are equal, sort by term lexicographically
1238+
return strings.Compare(termFreqList[i].Term, termFreqList[j].Term) < 0
1239+
}
1240+
return termFreqList[i].Frequency > termFreqList[j].Frequency
1241+
})
1242+
1243+
// Limit results
1244+
if limit > len(termFreqList) {
1245+
limit = len(termFreqList)
1246+
}
1247+
1248+
return termFreqList[:limit], nil
1249+
}

0 commit comments

Comments
 (0)