Skip to content

Commit 70573d0

Browse files
committed
Refactor code structure for improved readability and maintainability
1 parent 1fae85c commit 70573d0

File tree

8 files changed

+1844
-628
lines changed

8 files changed

+1844
-628
lines changed

README.md

Lines changed: 0 additions & 592 deletions
This file was deleted.

config.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ benchmark:
2121
top_k: 20
2222
embedding:
2323
dense:
24-
api_key_env: GOOGLE_API_KEY
24+
api_key_env: VOYAGE_API_KEY
2525
batch_size: 32
26-
dimensions: 768
27-
model: models/embedding-001
28-
provider: google
26+
dimensions: 1024
27+
model: voyage-3.5-lite
28+
provider: voyage
2929
vector_name: dense
3030
sparse:
3131
model: Qdrant/bm25

embedding/factory.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from embedding.sparse_embedder import SparseEmbedder
44
from langchain_qdrant import FastEmbedSparse
55
from langchain_google_genai import GoogleGenerativeAIEmbeddings
6+
from langchain_voyageai import VoyageAIEmbeddings
7+
68
import os
79

810

@@ -35,9 +37,39 @@ def get_embedder(cfg: dict):
3537

3638
elif provider == "google":
3739
model_name = cfg.get("model", "models/embedding-001")
38-
return GoogleGenerativeAIEmbeddings(
40+
dimensions = cfg.get("dimensions") or cfg.get("output_dimensionality")
41+
api_key = os.getenv("GOOGLE_API_KEY")
42+
43+
if not api_key:
44+
raise ValueError(
45+
"GOOGLE_API_KEY environment variable is required for Google embeddings")
46+
47+
# Initialize with or without dimensions parameter
48+
if dimensions:
49+
return GoogleGenerativeAIEmbeddings(
50+
model=model_name,
51+
google_api_key=api_key,
52+
output_dimensionality=dimensions
53+
)
54+
else:
55+
return GoogleGenerativeAIEmbeddings(
56+
model=model_name,
57+
google_api_key=api_key
58+
)
59+
60+
elif provider == "voyage":
61+
model_name = cfg.get("model", "voyage-3.5-lite")
62+
api_key = os.getenv("VOYAGE_API_KEY")
63+
64+
if not api_key:
65+
raise ValueError(
66+
"VOYAGE_API_KEY environment variable is required for Voyage embeddings")
67+
68+
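# VoyageAI embeddings use the model's native dimensions (1024 for voyage-3.5 and voyage-3.5-lite); the configured "dimensions" value is not passed to the client here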
69+
# Dimension reduction can be handled via truncation if needed
70+
return VoyageAIEmbeddings(
3971
model=model_name,
40-
google_api_key=os.getenv("GOOGLE_API_KEY")
72+
voyage_api_key=api_key
4173
)
4274

4375
elif provider == "sparse":
@@ -48,5 +80,5 @@ def get_embedder(cfg: dict):
4880

4981
else:
5082
raise ValueError(
51-
f"Unsupported embedder provider: '{provider}'. Supported: hf, titan, fastembed, sparse, google"
83+
f"Unsupported embedder provider: '{provider}'. Supported: hf, titan, fastembed, sparse, google, voyage"
5284
)

pipelines/configs/datasets/stackoverflow.yml

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,21 @@ dataset:
66
description: "SOSum: Extractive summaries of Stack Overflow posts (506 questions, 2278 posts)"
77

88
# Embedding strategy
9-
embedding_strategy: hybrid
10-
119
embedding:
10+
strategy: "hybrid"
1211
dense:
13-
provider: hf
14-
model_name: sentence-transformers/all-MiniLM-L6-v2
12+
provider: "google"
13+
model: "models/embedding-001"
1514
batch_size: 32
16-
device: cuda
17-
vector_name: dense
1815
sparse:
19-
provider: fastembed
20-
model_name: Qdrant/bm25
21-
vector_name: sparse
16+
provider: "sparse"
17+
model: "Qdrant/bm25"
18+
batch_size: 32
2219

2320
# Chunking configuration
2421
chunking:
25-
strategy: code_aware # Best for code-heavy content
26-
chunk_size: 800 # Larger chunks for code context
22+
strategy: "code_aware" # Best for code-heavy content
23+
chunk_size: 800 # Larger chunks for code context
2724
chunk_overlap: 100
2825
preserve_functions: true
2926
preserve_code_blocks: true
@@ -39,14 +36,14 @@ validation:
3936

4037
# Retriever configuration
4138
retriever:
42-
type: qdrant
39+
type: "qdrant"
4340
top_k: 15
4441

4542
# Qdrant settings
4643
qdrant:
47-
collection: sosum_stackoverflow_v1
48-
dense_vector_name: dense
49-
sparse_vector_name: sparse
44+
collection: "sosum_stackoverflow_v1"
45+
dense_vector_name: "dense"
46+
sparse_vector_name: "sparse"
5047

5148
# Upload settings
5249
upload:
Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,43 @@
1-
# Configuration for Hybrid Dense + Sparse Embeddings
2-
31
dataset:
42
name: "stackoverflow_sosum"
53
version: "v1.0.0"
6-
adapter: "stackoverflow" # REQUIRED: This was missing!
4+
adapter: "stackoverflow"
75

86
chunking:
9-
strategy: "recursive" # FIXED: Strategy name
7+
strategy: "recursive"
108
chunk_size: 512
119
chunk_overlap: 50
12-
separators: ["\n\n", "\n", " ", ""] # REQUIRED for recursive chunking
10+
separators: ["\n\n", "\n", " ", ""]
1311

1412
embedding:
15-
strategy: "hybrid" # FIXED: Moved from top level
13+
strategy: "hybrid"
1614
dense:
1715
provider: "google"
18-
model: "models/embedding-001" # FIXED: changed from model_name to model
16+
model: "models/embedding-001"
17+
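dimensions: 1536 # Sizes offered by newer Gemini embedding models: 128, 256, 512, 768, 1536, 3072 (default: 3072) - verify that models/embedding-001 accepts a custom output size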
1918
batch_size: 32
2019
sparse:
2120
provider: "sparse"
22-
model: "Qdrant/bm25" # FIXED: changed from model_name to model
21+
model: "Qdrant/bm25"
2322
batch_size: 32
2423

2524
qdrant:
2625
collection: "sosum_stackoverflow_hybrid_v1"
2726
dense_vector_name: "dense"
2827
sparse_vector_name: "sparse"
29-
distance_metric: "cosine" # REQUIRED: Added missing field
28+
distance_metric: "cosine"
3029

3130
upload:
3231
batch_size: 50
3332
wait: true
34-
versioning: true # ADDED: For proper versioning
33+
versioning: true
3534

3635
validation:
37-
enabled: true # REQUIRED: Added missing field
38-
max_text_length: 10000 # FIXED: changed from max_char_length
39-
min_text_length: 10 # FIXED: changed from min_char_length
36+
enabled: true
37+
max_text_length: 10000
38+
min_text_length: 10
4039

4140
smoke_tests:
4241
enabled: true
43-
sample_size: 5 # REQUIRED: Added missing field
42+
sample_size: 5
4443
min_success_rate: 0.7
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
embedding:
2+
strategy: "hybrid"
3+
dense:
4+
provider: "voyage"
5+
model: "voyage-3.5-lite"
6+
batch_size: 32
7+
dimensions: 1024
8+
sparse:
9+
provider: "sparse"
10+
model: "Qdrant/bm25"
11+
batch_size: 8
12+
13+
qdrant:
14+
collection: "stackoverflow_voyage_lite"
15+
host: "localhost"
16+
port: 6333
17+
timeout: 300
18+
distance: "Cosine"
19+
20+
chunking:
21+
method: "recursive"
22+
chunk_size: 500
23+
chunk_overlap: 100
24+
separators: ["\n\n", "\n", " ", ""]
25+
26+
processing:
27+
validate_documents: true
28+
enable_duplicate_detection: true
29+
similarity_threshold: 0.95
30+
batch_size: 100
31+
max_retries: 3
32+
retry_delay: 1.0
33+
34+
logging:
35+
level: "INFO"
36+
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# SOSum Stack Overflow Dataset Configuration with Voyage AI Premium Embeddings
2+
# Dataset: https://github.com/BonanKou/SOSum-A-Dataset-of-Extractive-Summaries-of-Stack-Overflow-Posts-and-labeling-tools
3+
dataset:
4+
name: "stackoverflow_sosum"
5+
version: "v1.0.0"
6+
adapter: "stackoverflow"
7+
8+
chunking:
9+
strategy: "recursive"
10+
chunk_size: 512
11+
chunk_overlap: 50
12+
separators: ["\n\n", "\n", " ", ""]
13+
14+
embedding:
15+
strategy: "hybrid"
16+
dense:
17+
provider: "voyage"
18+
model: "voyage-3.5" # Premium option: $0.06/1M tokens (better quality)
19+
dimensions: 1024 # Full dimension for maximum quality
20+
batch_size: 32
21+
sparse:
22+
provider: "sparse"
23+
model: "Qdrant/bm25"
24+
batch_size: 32
25+
26+
qdrant:
27+
collection: "sosum_stackoverflow_voyage_premium_v1"
28+
dense_vector_name: "dense"
29+
sparse_vector_name: "sparse"
30+
31+
upload:
32+
batch_size: 50
33+
wait: true
34+
versioning: true
35+
36+
validation:
37+
min_char_length: 30
38+
max_char_length: 50000
39+
remove_duplicates: true
40+
clean_html: true
41+
preserve_code_blocks: true
42+
allowed_languages: ["en"]
43+
44+
# Evaluation settings
45+
evaluation:
46+
k_values: [1, 3, 5, 10, 15]
47+
similarity_threshold: 0.7
48+
49+
# Smoke tests
50+
smoke_tests:
51+
min_success_rate: 0.8
52+
golden_queries:
53+
- query: "Python list comprehension example"
54+
min_recall: 0.1
55+
- query: "JavaScript async function"
56+
min_recall: 0.1
57+
- query: "How to solve error in code"
58+
min_recall: 0.1
59+
- query: "Best practice programming"
60+
min_recall: 0.1
61+
62+
# Output configuration
63+
output_dir: "output/sosum_stackoverflow_voyage_premium"
64+
65+
# Embedding cache
66+
embedding_cache:
67+
enabled: true
68+
dir: "cache/embeddings/sosum_stackoverflow_voyage_premium"

0 commit comments

Comments
 (0)