Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ type UpdateConfig struct {
// SearchConfig groups the search-related settings. Each field is read from
// the YAML key named in its tag ("boost", "hybrid", "dedup").
type SearchConfig struct {
// Boost holds the settings for structural boosting of results.
Boost BoostConfig `yaml:"boost"`
// Hybrid holds the settings for hybrid (vector + text) search.
Hybrid HybridConfig `yaml:"hybrid"`
// Dedup holds the settings for file-level deduplication of results.
Dedup DedupConfig `yaml:"dedup"`
}

// DedupConfig controls file-level deduplication of search results.
type DedupConfig struct {
// Enabled switches deduplication on or off (YAML key "enabled").
// The default configuration in this file sets it to true.
Enabled bool `yaml:"enabled"`
}

type HybridConfig struct {
Expand Down Expand Up @@ -300,6 +306,9 @@ func DefaultConfig() *Config {
RPGMaxDirtyFilesPerBatch: DefaultWatchRPGMaxDirtyFilesPerBatch,
},
Search: SearchConfig{
Dedup: DedupConfig{
Enabled: true,
},
Hybrid: HybridConfig{
Enabled: false,
K: 60,
Expand Down
17 changes: 17 additions & 0 deletions search/dedup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package search

import "github.com/yoanbernabeu/grepai/store"

// DeduplicateByFile keeps only the highest-scoring chunk per file path.
//
// The output holds one result per distinct Chunk.FilePath, ordered by where
// each file first appears in results. When a later chunk of an already-seen
// file has a strictly higher Score, it replaces the kept chunk in place, so
// the "highest-scoring" guarantee holds even when results is not sorted by
// score. For input sorted by descending score this is equivalent to keeping
// the first chunk seen for each file. On equal scores the earliest chunk wins.
//
// The input slice is not modified and the returned slice is never nil.
func DeduplicateByFile(results []store.SearchResult) []store.SearchResult {
	// slot maps a file path to the index of its kept result in deduped.
	slot := make(map[string]int, len(results))
	deduped := make([]store.SearchResult, 0, len(results))
	for _, r := range results {
		i, seen := slot[r.Chunk.FilePath]
		if !seen {
			slot[r.Chunk.FilePath] = len(deduped)
			deduped = append(deduped, r)
			continue
		}
		// Same file seen before: only a strictly better score displaces it.
		if r.Score > deduped[i].Score {
			deduped[i] = r
		}
	}
	return deduped
}
62 changes: 62 additions & 0 deletions search/dedup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package search

import (
"testing"

"github.com/yoanbernabeu/grepai/store"
)

// TestDeduplicateByFile feeds score-ordered results that contain repeated
// file paths and expects exactly one chunk per file to survive: the first
// (highest-scoring) one, in the original order.
func TestDeduplicateByFile(t *testing.T) {
	input := []store.SearchResult{
		{Chunk: store.Chunk{ID: "a_0", FilePath: "a.go"}, Score: 0.9},
		{Chunk: store.Chunk{ID: "b_0", FilePath: "b.go"}, Score: 0.8},
		{Chunk: store.Chunk{ID: "a_1", FilePath: "a.go"}, Score: 0.7},
		{Chunk: store.Chunk{ID: "c_0", FilePath: "c.go"}, Score: 0.6},
		{Chunk: store.Chunk{ID: "b_1", FilePath: "b.go"}, Score: 0.5},
	}

	got := DeduplicateByFile(input)
	if n := len(got); n != 3 {
		t.Fatalf("expected 3 results, got %d", n)
	}

	// Parallel slices: wantScores[i] is the score expected alongside wantIDs[i].
	wantIDs := []string{"a_0", "b_0", "c_0"}
	wantScores := []float32{0.9, 0.8, 0.6}

	for idx, id := range wantIDs {
		if actual := got[idx].Chunk.ID; actual != id {
			t.Errorf("result[%d]: expected ID %q, got %q", idx, id, actual)
		}
		if actual := got[idx].Score; actual != wantScores[idx] {
			t.Errorf("result[%d]: expected score %v, got %v", idx, wantScores[idx], actual)
		}
	}
}

// TestDeduplicateByFile_Empty checks that a nil input produces no results.
func TestDeduplicateByFile_Empty(t *testing.T) {
	got := DeduplicateByFile(nil)
	if n := len(got); n != 0 {
		t.Fatalf("expected 0 results, got %d", n)
	}
}

// TestDeduplicateByFile_AllUnique checks that input with exactly one chunk
// per file passes through untouched: same length, same chunks, same order.
func TestDeduplicateByFile_AllUnique(t *testing.T) {
	results := []store.SearchResult{
		{Chunk: store.Chunk{ID: "a_0", FilePath: "a.go"}, Score: 0.9},
		{Chunk: store.Chunk{ID: "b_0", FilePath: "b.go"}, Score: 0.8},
		{Chunk: store.Chunk{ID: "c_0", FilePath: "c.go"}, Score: 0.7},
	}

	deduped := DeduplicateByFile(results)

	if len(deduped) != len(results) {
		t.Fatalf("expected %d results, got %d", len(results), len(deduped))
	}

	// A length check alone would accept dropped-and-duplicated or reordered
	// output, so compare each position against the input.
	for i := range results {
		if deduped[i].Chunk.ID != results[i].Chunk.ID {
			t.Errorf("result[%d]: expected ID %q, got %q", i, results[i].Chunk.ID, deduped[i].Chunk.ID)
		}
		if deduped[i].Score != results[i].Score {
			t.Errorf("result[%d]: expected score %v, got %v", i, results[i].Score, deduped[i].Score)
		}
	}
}
23 changes: 12 additions & 11 deletions search/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ type Searcher struct {
embedder embedder.Embedder
boostCfg config.BoostConfig
hybridCfg config.HybridConfig
dedupCfg config.DedupConfig
}

func NewSearcher(st store.VectorStore, emb embedder.Embedder, searchCfg config.SearchConfig) *Searcher {
Expand All @@ -21,37 +22,40 @@ func NewSearcher(st store.VectorStore, emb embedder.Embedder, searchCfg config.S
embedder: emb,
boostCfg: searchCfg.Boost,
hybridCfg: searchCfg.Hybrid,
dedupCfg: searchCfg.Dedup,
}
}

func (s *Searcher) Search(ctx context.Context, query string, limit int, pathPrefix string) ([]store.SearchResult, error) {
// Embed the query
queryVector, err := s.embedder.Embed(ctx, query)
if err != nil {
return nil, err
}

// Fetch more results to allow re-ranking
fetchLimit := limit * 2
fetchMultiplier := 2
if s.dedupCfg.Enabled {
fetchMultiplier = 4
}
fetchLimit := limit * fetchMultiplier

var results []store.SearchResult

if s.hybridCfg.Enabled {
// Hybrid search: combine vector + text search with RRF
results, err = s.hybridSearch(ctx, query, queryVector, fetchLimit, pathPrefix)
} else {
// Vector-only search
results, err = s.store.Search(ctx, queryVector, fetchLimit, store.SearchOptions{PathPrefix: pathPrefix})
}

if err != nil {
return nil, err
}

// Apply structural boosting
results = ApplyBoost(results, s.boostCfg)

// Trim to requested limit
if s.dedupCfg.Enabled {
results = DeduplicateByFile(results)
}

if len(results) > limit {
results = results[:limit]
}
Expand All @@ -61,24 +65,21 @@ func (s *Searcher) Search(ctx context.Context, query string, limit int, pathPref

// hybridSearch combines vector search and text search using RRF.
func (s *Searcher) hybridSearch(ctx context.Context, query string, queryVector []float32, limit int, pathPrefix string) ([]store.SearchResult, error) {
// Vector search
vectorResults, err := s.store.Search(ctx, queryVector, limit, store.SearchOptions{PathPrefix: pathPrefix})
if err != nil {
return nil, err
}

// Text search (get all chunks first)
allChunks, err := s.store.GetAllChunks(ctx)
if err != nil {
return nil, err
}

textResults := TextSearch(ctx, allChunks, query, limit, pathPrefix)

// Combine with RRF
k := s.hybridCfg.K
if k <= 0 {
k = 60 // default
k = 60
}

return ReciprocalRankFusion(k, limit, vectorResults, textResults), nil
Expand Down