spyrchat · spyrchat · Apr 26, 2025 · Apr 26, 2025 · Apr 26, 2025 · Apr 27, 2025
diff --git a/.github/workflows/pipeline-tests.yml b/.github/workflows/pipeline-tests.yml
@@ -0,0 +1,200 @@
+name: Pipeline Tests
+
+on:
+  push:
+    branches: [ main, development ]
+  pull_request:
+    branches: [ main, development ]
+
+jobs:
+  test-minimal:
+    name: Minimal Pipeline Tests
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.13'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install -r tests/requirements-minimal.txt
+
+      - name: Run minimal pipeline tests (no external services)
+        run: |
+          python -m pytest tests/pipeline/test_minimal_pipeline.py tests/pipeline/test_components.py -v --tb=short
+
+      - name: Run configuration validation
+        run: |
+          python -c "
+          import yaml
+          import sys
+
+          # Test that all YAML configs are valid
+          configs = ['config.yml', 'pipelines/configs/retrieval/ci_google_gemini.yml']
+          for config in configs:
+              try:
+                  with open(config) as f:
+                      yaml.safe_load(f)
+                  print(f'✅ {config} is valid')
+              except Exception as e:
+                  print(f'❌ {config} failed: {e}')
+                  sys.exit(1)
+          "
+
+  test-integration:
+    name: Integration Tests with Qdrant
+    runs-on: ubuntu-latest
+
+    services:
+      qdrant:
+        image: qdrant/qdrant:latest
+        ports:
+          - 6333:6333
+        options: >-
+          --health-cmd "curl -f http://localhost:6333/collections || exit 1"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 10
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.13'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install -r tests/requirements-test.txt
+
+      - name: Wait for Qdrant to be ready
+        run: |
+          timeout 60 bash -c 'until curl -f http://localhost:6333/collections; do sleep 2; done'
+          echo "Qdrant is ready!"
+
+      - name: Test Qdrant connectivity
+        run: |
+          python -m pytest tests/pipeline/test_qdrant_connectivity.py -v --tb=short
+
+      - name: Run basic integration tests
+        run: |
+          python -m pytest tests/pipeline/ -v --tb=short -m "not requires_api"
+
+  test-end-to-end:
+    name: End-to-End Pipeline Tests
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository
+
+    services:
+      qdrant:
+        image: qdrant/qdrant:latest
+        ports:
+          - 6333:6333
+        options: >-
+          --health-cmd "curl -f http://localhost:6333/collections || exit 1"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 10
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.13'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install -r tests/requirements-test.txt
+
+      - name: Wait for Qdrant to be ready
+        run: |
+          timeout 60 bash -c 'until curl -f http://localhost:6333/collections; do sleep 2; done'
+          echo "Qdrant is ready!"
+
+      - name: Test Qdrant health
+        run: |
+          curl -f http://localhost:6333/collections
+          echo "Qdrant collections endpoint working"
+
+      - name: Run end-to-end pipeline tests
+        env:
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          QDRANT_HOST: localhost
+          QDRANT_PORT: 6333
+        run: |
+          if [ -z "$GOOGLE_API_KEY" ]; then
+            echo "⚠️ GOOGLE_API_KEY not set - skipping end-to-end tests"
+            exit 0
+          fi
+
+          echo "🔑 API key available - running full end-to-end tests"
+          python -m pytest tests/pipeline/test_end_to_end.py -v --tb=short -m "requires_api"
+
+      - name: Run comprehensive test suite
+        env:
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          QDRANT_HOST: localhost
+          QDRANT_PORT: 6333
+        run: |
+          if [ -n "$GOOGLE_API_KEY" ]; then
+            echo "🚀 Running comprehensive test suite with API"
+            python tests/pipeline/run_tests.py
+          else
+            echo "⚠️ Running minimal test suite without API"
+            python -m pytest tests/pipeline/test_minimal_pipeline.py tests/pipeline/test_components.py -v
+          fi
+
+  test-security:
+    name: Security and Config Validation
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Check for hardcoded secrets
+        run: |
+          # Check that no API keys are hardcoded
+          if grep -r "sk-" . --exclude-dir=.git --exclude="*.md" --exclude="*.yml"; then
+            echo "❌ Found potential hardcoded API keys"
+            exit 1
+          fi
+
+          if grep -r "google_api_key.*=" . --exclude-dir=.git --exclude="*.md" --exclude="*.yml" | grep -v "getenv\|environ"; then
+            echo "❌ Found potential hardcoded Google API keys"
+            exit 1
+          fi
+
+          echo "✅ No hardcoded secrets found"
+
+      - name: Validate configuration structure
+        run: |
+          python -c "
+          import yaml
+
+          # Validate CI config structure
+          with open('pipelines/configs/retrieval/ci_google_gemini.yml') as f:
+              config = yaml.safe_load(f)
+
+          # Check required fields
+          assert 'retrieval_pipeline' in config
+          assert 'retriever' in config['retrieval_pipeline']
+          assert 'embedding' in config['retrieval_pipeline']['retriever']
+          assert 'google' == config['retrieval_pipeline']['retriever']['embedding']['dense']['provider']
+          assert 'GOOGLE_API_KEY' == config['retrieval_pipeline']['retriever']['embedding']['dense']['api_key_env']
+
+          print('✅ Configuration structure is valid')
+          "
diff --git a/.gitignore b/.gitignore
@@ -19,4 +19,12 @@ climate-fever
 *.log
 __pycache__
 sandbox/*
-/__pycache__
+/__pycache__
+synthetic_dataset\text_dataset_template.json
+extraction_output/
+.idea/misc.xml
+.idea/modules.xml
+.idea/Thesis.iml
+.idea/vcs.xml
+.idea/inspectionProfiles/profiles_settings.xml
+*.json
diff --git a/BENCHMARK_OPTIMIZATION_GUIDE.md b/BENCHMARK_OPTIMIZATION_GUIDE.md
@@ -0,0 +1,166 @@
+# RAG Benchmark Optimization System - Usage Guide
+
+## 🎯 Summary
+
+We have successfully created a flexible benchmark optimization system that fixes the critical `external_id` retrieval issue and enables easy parameter optimization experiments.
+
+### ✅ Key Achievements
+
+1. **Fixed External ID Retrieval**: Modified dense retriever to use direct Qdrant API, preserving `external_id` in document metadata
+2. **Excellent Benchmark Results**: 
+   - Precision@5: 75.5%
+   - Recall@5: 69.3%
+   - MRR: 92.0%
+3. **Flexible Configuration System**: Created modular benchmark scenarios for easy optimization
+4. **Ground Truth Integration**: Proper evaluation using real StackOverflow question-answer pairs
+
+## 🚀 Quick Start - Running Benchmarks
+
+### Option 1: Interactive CLI (Easiest)
+```bash
+cd /home/spiros/Desktop/Thesis/Thesis
+python run_benchmark_optimization.py
+```
+
+Then choose:
+- `1` - Quick test (10 queries)
+- `2` - Single scenario 
+- `3` - Run all scenarios
+- `4` - Compare previous results
+
+### Option 2: Command Line
+```bash
+# Run single scenario
+python benchmark_optimizer.py --scenario benchmark_scenarios/quick_test.yml
+
+# Run all scenarios
+python benchmark_optimizer.py --scenarios-dir benchmark_scenarios
+
+# Compare existing results only
+python benchmark_optimizer.py --compare-only
+```
+
+## 📊 Available Optimization Scenarios
+
+Located in `benchmark_scenarios/`:
+
+1. **quick_test.yml** - Fast 10-query test for rapid iteration
+2. **dense_baseline.yml** - Dense retrieval with top_k=10, threshold=0.1
+3. **dense_high_recall.yml** - Dense with top_k=20, threshold=0.05 (more results)
+4. **dense_high_precision.yml** - Dense with threshold=0.3 (stricter filtering)
+5. **sparse_bm25.yml** - Sparse BM25 retrieval
+6. **hybrid_retrieval.yml** - Combined dense + sparse retrieval
+
+## 🔧 Creating Custom Scenarios
+
+Create new `.yml` files in `benchmark_scenarios/` with this structure:
+
+```yaml
+# Description of the experiment
+description: "Your experiment description"
+
+# Dataset configuration
+dataset:
+  path: "/home/spiros/Desktop/Thesis/datasets/sosum/data"
+  use_ground_truth: true
+
+# Retrieval configuration  
+retrieval:
+  type: "dense"  # dense, sparse, or hybrid
+  top_k: 10
+  score_threshold: 0.1
+
+# Embedding configuration (override main config)
+embedding:
+  dense:
+    provider: google
+    model: models/embedding-001
+    dimensions: 768
+    api_key_env: GOOGLE_API_KEY
+    batch_size: 32
+    vector_name: dense
+  strategy: dense
+
+# Evaluation configuration
+evaluation:
+  k_values: [1, 5, 10]
+  metrics:
+    retrieval: ["precision@k", "recall@k", "mrr", "ndcg@k"]
+
+# Experiment parameters
+max_queries: 50
+experiment_name: "your_experiment_name"
+```
+
+## 📈 Optimization Parameters You Can Tune
+
+### Retrieval Parameters
+- `top_k`: Number of documents to retrieve (5, 10, 15, 20)
+- `score_threshold`: Minimum similarity score (0.0, 0.1, 0.2, 0.3)
+- `type`: Retrieval strategy (dense, sparse, hybrid)
+
+### Embedding Parameters
+- `model`: Different embedding models
+- `batch_size`: Processing batch size (16, 32, 64)
+- `dimensions`: Embedding dimensions (384, 768, 1024)
+
+### Evaluation Parameters  
+- `max_queries`: Dataset size (10, 25, 50, 100, 500)
+- `k_values`: Evaluation depths ([1,5,10], [1,5,10,20])
+
+## 🏆 Results Analysis
+
+The system automatically:
+- Tracks all experiment results
+- Compares scenarios across metrics
+- Identifies best performers for each metric
+- Saves results to `benchmark_optimization_results.yml`
+
+### Key Metrics
+- **Precision@K**: How many retrieved docs are relevant
+- **Recall@K**: How many relevant docs were retrieved  
+- **MRR**: Mean Reciprocal Rank (position of first relevant result)
+- **NDCG@K**: Normalized Discounted Cumulative Gain
+
+## 🔍 Example Optimization Workflow
+
+1. **Start with quick test**:
+   ```bash
+   python benchmark_optimizer.py --scenario benchmark_scenarios/quick_test.yml
+   ```
+
+2. **Run baseline experiments**:
+   ```bash
+   python benchmark_optimizer.py --scenarios-dir benchmark_scenarios
+   ```
+
+3. **Create custom scenarios** based on baseline results
+
+4. **Compare all results**:
+   ```bash
+   python benchmark_optimizer.py --compare-only
+   ```
+
+## 📊 Current Best Configuration
+
+Based on our tests, the current best performing setup:
+- **Retrieval**: Dense with Google Gemini embeddings
+- **Top K**: 10 documents
+- **Score Threshold**: 0.1 (from config, but score filter at 0.3)
+- **Reranking**: Cross-encoder reranking with ms-marco-MiniLM-L-6-v2
+- **Results**: 75.5% Precision@5, 69.3% Recall@5, 92% MRR
+
+## 🚨 Important Notes
+
+1. **Ground Truth**: System uses real StackOverflow question-answer pairs for evaluation
+2. **External ID Fix**: Our custom dense retriever preserves document IDs correctly
+3. **Scalability**: Adjust `max_queries` based on time constraints
+4. **Consistency**: All scenarios use the same evaluation methodology for fair comparison
+
+## 🎯 Next Steps for Optimization
+
+1. **Hyperparameter Tuning**: Create scenarios with different top_k and threshold values
+2. **Embedding Models**: Test different embedding providers/models
+3. **Hybrid Strategies**: Optimize dense+sparse combination weights
+4. **Reranking**: Experiment with different reranker models
+5. **Dataset Size**: Scale up to full 506 questions for final evaluation