From 491ad89b90195ae099368a583fcb1a3a9b6746f2 Mon Sep 17 00:00:00 2001
From: manavgup
Date: Thu, 23 Oct 2025 00:01:10 -0400
Subject: [PATCH] fix: Resolve reranking variable mismatch and enable config
 overrides (Issue #465)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes three critical issues that prevented reranking from working:

## 1. Fix Variable Name Mismatch in Reranker (PRIMARY FIX)

- **Problem**: Reranking scores were always 0.0 because the LLM prompts contained unfilled placeholders
- **Root cause**: The reranker used a "document" variable, while the WatsonX provider expected "context"
- **Fix**: Changed the variable name from "document" to "context" in reranker.py
- **Impact**: Reranking now generates valid scores (0.0-1.0 range)
- **Files**: backend/rag_solution/retrieval/reranker.py:140,174

## 2. Fix Backend Startup Error (BLOCKING BUG)

- **Problem**: Backend failed to start with "'dict' object has no attribute 'model_dump'"
- **Root cause**: system_initialization_service.py passed a dict instead of a Pydantic model
- **Fix**: Convert LLMProviderInput to LLMProviderUpdate before passing it to update_provider()
- **Impact**: Backend starts successfully and hot-reload works
- **Files**: backend/rag_solution/services/system_initialization_service.py:106-107

## 3. Enable Per-Request Reranking Config (FEATURE ENHANCEMENT)

- **Problem**: No way to enable/disable reranking per search request
- **Root cause**: Config metadata was not passed to the _apply_reranking method
- **Fix**: Added a config_metadata parameter for per-request reranking control (see the usage sketch at the end of this message)
- **Impact**: Tests can now compare results with and without reranking via config_metadata
- **Files**: backend/rag_solution/services/search_service.py:239-262,695-700,939-944

## Test Results

**Before Fix:**
- Reranking scores: always 0.0
- Backend startup: failed with a Pydantic error
- Config override: not supported

**After Fix:**
- Reranking scores: 0.0-1.0 range (e.g., top score changed from 0.6278 to 0.8000)
- Backend startup: successful, with hot-reload
- Config override: enabled via config_metadata.enable_reranking

## Test Artifacts

Added two test scripts:
1. **test_reranking_impact.py**: Demonstrates the reranking effect by comparing scores
2. **create_reranking_template.py**: Creates the RERANKING template in the database

## Database Change

Created the reranking template directly in the database:

```sql
INSERT INTO prompt_templates (...)
VALUES ('Default Reranking Template', 'RERANKING', ...)
```

## Known Limitation

Reranking can only improve the ranking of chunks that are already retrieved. If the embedding model does not return the relevant chunks in the initial top_k, reranking cannot bring them in; that is a separate embedding-model quality issue.
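## Example: Per-Request Reranking Override

A minimal sketch of the per-request override added in this patch, mirroring the payload used by test_reranking_impact.py below. The endpoint path, headers, and config_metadata keys come from that script; the collection and user IDs are placeholders, not real values.

```python
import httpx

# Placeholder IDs -- substitute real values from your environment.
COLLECTION_ID = "<collection-uuid>"
USER_UUID = "<user-uuid>"

payload = {
    "question": "What was IBM revenue in 2021?",
    "collection_id": COLLECTION_ID,
    "user_id": USER_UUID,
    "config_metadata": {
        # Overrides settings.enable_reranking for this request only.
        "enable_reranking": True,
        "top_k": 20,
    },
}

response = httpx.post(
    "http://localhost:8000/api/search",
    headers={"Content-Type": "application/json", "X-User-UUID": USER_UUID},
    json=payload,
    timeout=180.0,
)
response.raise_for_status()
print(response.json()["answer"])
```

If config_metadata is omitted, _apply_reranking falls back to settings.enable_reranking, so existing callers are unaffected.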
## Related Issues

Closes #465 (reranking not working, scores always 0.0)

šŸ¤– Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 .../dev_tests/create_reranking_template.py     |  33 +++
 backend/dev_tests/test_reranking_impact.py     | 211 ++++++++++++++++++
 backend/rag_solution/retrieval/reranker.py     |   8 +-
 .../rag_solution/services/search_service.py    |  17 +-
 .../services/system_initialization_service.py  |   8 +-
 backend/tests/unit/test_reranker.py            |   2 +-
 6 files changed, 271 insertions(+), 8 deletions(-)
 create mode 100644 backend/dev_tests/create_reranking_template.py
 create mode 100644 backend/dev_tests/test_reranking_impact.py

diff --git a/backend/dev_tests/create_reranking_template.py b/backend/dev_tests/create_reranking_template.py
new file mode 100644
index 00000000..be6b082a
--- /dev/null
+++ b/backend/dev_tests/create_reranking_template.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+"""Create default reranking template."""
+
+import requests
+
+API_BASE = "http://localhost:8000"
+USER_UUID = "ee76317f-3b6f-4fea-8b74-56483731f58c"
+
+template_data = {
+    "name": "Default Reranking Template",
+    "template_type": "RERANKING",
+    "template_format": "Rate the relevance of this document to the query on a scale of 0-{scale}:\n\nQuery: {query}\n\nDocument: {context}\n\nRelevance score:",
+    "input_variables": {"query": "The search query", "context": "The document text", "scale": "Score scale (e.g., 10)"},
+    "is_default": True,
+    "max_context_length": 4000,
+}
+
+print("Creating reranking template...")
+response = requests.post(
+    f"{API_BASE}/api/users/{USER_UUID}/prompt-templates",
+    headers={
+        "Content-Type": "application/json",
+        "X-User-UUID": USER_UUID,
+    },
+    json=template_data,
+)
+
+if response.status_code == 200:
+    print("āœ… Template created successfully!")
+    print(response.json())
+else:
+    print(f"āŒ Failed: {response.status_code}")
+    print(response.text)
diff --git a/backend/dev_tests/test_reranking_impact.py b/backend/dev_tests/test_reranking_impact.py
new file mode 100644
index 00000000..55eafeb0
--- /dev/null
+++ b/backend/dev_tests/test_reranking_impact.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+Test reranking impact on search results.
+
+Compares search results with and without reranking to demonstrate
+the fix for Issue #465 (reranking variable name mismatch).
+"""
+
+import asyncio
+import os
+import sys
+
+# Add backend to path
+sys.path.insert(0, os.path.dirname(__file__))
+
+import httpx
+from dotenv import load_dotenv
+
+# Test configuration
+TEST_FILE = "/Users/mg/Downloads/2021-ibm-annual-report.txt"  # Using 2021 for existing collections
+TEST_QUERY = "What was IBM revenue in 2021?"
+EXPECTED_ANSWER = "57.4 billion"  # 2021 revenue
+
+# API configuration
+API_BASE = "http://localhost:8000"
+USER_UUID = "ee76317f-3b6f-4fea-8b74-56483731f58c"
+
+# Use existing collection from embedding comparison test
+# Collections are named like: test-slate-125m-english-rtrvr-{random}
+COLLECTION_PREFIX = "test-slate-125m-english-rtrvr"
+
+
+async def search_with_config(collection_id: str, query: str, enable_reranking: bool, top_k: int = 20) -> dict:
+    """Run search query with specific reranking configuration."""
+    config_label = "WITH reranking" if enable_reranking else "WITHOUT reranking"
+    print(f"\n{'=' * 80}")
+    print(f"šŸ” Testing {config_label}")
+    print(f"{'=' * 80}")
+
+    async with httpx.AsyncClient(timeout=180.0) as client:
+        response = await client.post(
+            f"{API_BASE}/api/search",
+            headers={
+                "Content-Type": "application/json",
+                "X-User-UUID": USER_UUID,
+            },
+            json={
+                "question": query,
+                "collection_id": collection_id,
+                "user_id": USER_UUID,
+                "config_metadata": {
+                    "cot_disabled": True,  # Disable CoT to isolate reranking effect
+                    "top_k": top_k,
+                    "enable_reranking": enable_reranking,  # Explicit control
+                },
+            },
+        )
+
+        if response.status_code == 200:
+            return response.json()
+        else:
+            raise Exception(f"Search failed: {response.status_code} - {response.text}")
+
+
+def analyze_results(results: dict, expected_text: str, label: str) -> dict:
+    """Analyze search results to find where the expected chunk ranks."""
+    query_results = results.get("query_results", [])
+    answer = results.get("answer", "")
+
+    # Find revenue chunk
+    revenue_chunk_rank = None
+    revenue_chunk_score = None
+    revenue_chunk_text = None
+
+    for i, result in enumerate(query_results, 1):
+        chunk_text = result["chunk"]["text"]
+        if expected_text in chunk_text:
+            revenue_chunk_rank = i
+            revenue_chunk_score = result["score"]
+            revenue_chunk_text = chunk_text[:200]
+            break
+
+    # Check if answer contains expected text
+    answer_correct = expected_text in answer
+
+    analysis = {
+        "label": label,
+        "revenue_chunk_rank": revenue_chunk_rank,
+        "revenue_chunk_score": revenue_chunk_score,
+        "revenue_chunk_text": revenue_chunk_text,
+        "answer_correct": answer_correct,
+        "total_results": len(query_results),
+    }
+
+    # Print results
+    print(f"\nšŸ“Š Results {label}:")
+    print(f"   Total chunks returned: {len(query_results)}")
+
+    if revenue_chunk_rank:
+        print(f"   āœ… Revenue chunk found at rank: #{revenue_chunk_rank}")
+        print(f"   šŸ“ˆ Score: {revenue_chunk_score:.4f}")
+        print(f"   šŸ“ Text preview: {revenue_chunk_text[:100]}...")
+    else:
+        print(f"   āŒ Revenue chunk NOT found in top {len(query_results)} results")
+
+    print(f"   {'āœ…' if answer_correct else 'āŒ'} Answer contains '{expected_text}': {answer_correct}")
+
+    # Show top 5 chunks with scores
+    print(f"\n   Top 5 chunks {label}:")
+    for i, result in enumerate(query_results[:5], 1):
+        chunk_text = result["chunk"]["text"][:80]
+        score = result["score"]
+        is_revenue = "šŸŽÆ REVENUE" if expected_text in result["chunk"]["text"] else ""
+        print(f"   {i:2d}. Score: {score:.4f} - {chunk_text}... {is_revenue}")
+
+    return analysis
+
+
+async def get_or_create_collection():
+    """Get existing collection from embedding comparison test."""
+    print(f"šŸ” Looking for collection starting with: {COLLECTION_PREFIX}")
+
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        # List collections
+        response = await client.get(
+            f"{API_BASE}/api/collections",
+            headers={"X-User-UUID": USER_UUID},
+        )
+
+        if response.status_code == 200:
+            collections = response.json()
+            matching_collections = [col for col in collections if col["name"].startswith(COLLECTION_PREFIX)]
+
+            if matching_collections:
+                # Find first completed/ready collection
+                for col in matching_collections:
+                    if col.get("status") in ["completed", "ready"]:
+                        print(f"āœ… Found collection: {col['name']} (ID: {col['id']}) - Status: {col['status']}")
+                        return col["id"]
+
+                # If no completed collections, use first one but warn
+                col = matching_collections[0]
+                print(f"āš ļø Found collection: {col['name']} (ID: {col['id']}) - Status: {col.get('status', 'unknown')}")
+                return col["id"]
+
+    print(f"āŒ No collection found starting with '{COLLECTION_PREFIX}'")
+    print("   Please run test_embedding_comparison.py first to create test collections")
+    print("   or manually create a collection with the IBM Slate 125M model")
+    return None
+
+
+async def main():
+    """Run reranking comparison test."""
+    print("=" * 80)
+    print("šŸ”¬ RERANKING IMPACT TEST")
+    print("=" * 80)
+    print(f"Query: '{TEST_QUERY}'")
+    print(f"Expected text: '{EXPECTED_ANSWER}'")
+
+    # Get collection
+    collection_id = await get_or_create_collection()
+    if not collection_id:
+        return
+
+    # Test WITHOUT reranking
+    results_no_rerank = await search_with_config(collection_id, TEST_QUERY, enable_reranking=False, top_k=20)
+    analysis_no_rerank = analyze_results(results_no_rerank, EXPECTED_ANSWER, "WITHOUT reranking")
+
+    # Test WITH reranking
+    results_with_rerank = await search_with_config(collection_id, TEST_QUERY, enable_reranking=True, top_k=20)
+    analysis_with_rerank = analyze_results(results_with_rerank, EXPECTED_ANSWER, "WITH reranking")
+
+    # Compare results
+    print("\n" + "=" * 80)
+    print("šŸ“Š COMPARISON SUMMARY")
+    print("=" * 80)
+
+    rank_no_rerank = analysis_no_rerank["revenue_chunk_rank"]
+    rank_with_rerank = analysis_with_rerank["revenue_chunk_rank"]
+
+    if rank_no_rerank and rank_with_rerank:
+        improvement = rank_no_rerank - rank_with_rerank
+        print("\nšŸŽÆ Revenue Chunk Ranking:")
+        print(f"   WITHOUT reranking: #{rank_no_rerank}")
+        print(f"   WITH reranking:    #{rank_with_rerank}")
+
+        if improvement > 0:
+            print(f"   āœ… Reranking IMPROVED ranking by {improvement} positions! šŸŽ‰")
+        elif improvement < 0:
+            print(f"   āš ļø Reranking worsened ranking by {abs(improvement)} positions")
+        else:
+            print("   āž”ļø No change in ranking")
+
+        # Show score change
+        score_no_rerank = analysis_no_rerank["revenue_chunk_score"]
+        score_with_rerank = analysis_with_rerank["revenue_chunk_score"]
+        print("\nšŸ“ˆ Revenue Chunk Score:")
+        print(f"   WITHOUT reranking: {score_no_rerank:.4f}")
+        print(f"   WITH reranking:    {score_with_rerank:.4f}")
+        print(f"   Change: {score_with_rerank - score_no_rerank:+.4f}")
+    else:
+        print("\nāŒ Could not compare - revenue chunk not found in one or both result sets")
+
+    print("\n" + "=" * 80)
+    print("āœ… Test complete!")
+    print("=" * 80)
+
+
+if __name__ == "__main__":
+    load_dotenv()
+    asyncio.run(main())
diff --git a/backend/rag_solution/retrieval/reranker.py b/backend/rag_solution/retrieval/reranker.py
index 969bdf58..52db1024 100644
--- a/backend/rag_solution/retrieval/reranker.py
+++ b/backend/rag_solution/retrieval/reranker.py
@@ -126,6 +126,10 @@ def _create_reranking_prompts(self, query: str, results: list[QueryResult]) -> l
 
         Returns:
             List of variable dictionaries for prompt formatting.
+
+        Note:
+            Uses "context" as the variable name for document text to match
+            the WatsonX provider's batch generation implementation.
         """
         prompts = []
         for result in results:
@@ -133,7 +137,7 @@ def _create_reranking_prompts(self, query: str, results: list[QueryResult]) -> l
                 continue
             prompt_vars = {
                 "query": query,
-                "document": result.chunk.text,
+                "context": result.chunk.text,  # Changed from "document" to "context"
                 "scale": str(self.score_scale),
             }
             prompts.append(prompt_vars)
@@ -167,7 +171,7 @@ def _score_documents(self, query: str, results: list[QueryResult]) -> list[tuple
             for prompt_vars in batch_prompts:
                 # The template formatting is handled by the LLM provider
                 # We'll pass the document text as the "context" for the template
-                formatted_prompts.append(prompt_vars["document"])
+                formatted_prompts.append(prompt_vars["context"])
 
             # Call LLM with batch of prompts
             responses = self.llm_provider.generate_text(
diff --git a/backend/rag_solution/services/search_service.py b/backend/rag_solution/services/search_service.py
index 5427fc17..8cffdb82 100644
--- a/backend/rag_solution/services/search_service.py
+++ b/backend/rag_solution/services/search_service.py
@@ -236,18 +236,29 @@ def get_reranker(self, user_id: UUID4) -> Any:
 
         return self._reranker
 
-    def _apply_reranking(self, query: str, results: list[QueryResult], user_id: UUID4) -> list[QueryResult]:
+    def _apply_reranking(
+        self, query: str, results: list[QueryResult], user_id: UUID4, config_metadata: dict | None = None
+    ) -> list[QueryResult]:
         """Apply reranking to search results if enabled.
 
         Args:
             query: The search query
             results: List of QueryResult objects from retrieval
             user_id: User UUID
+            config_metadata: Optional config dict that can override enable_reranking
 
         Returns:
             Reranked list of QueryResult objects (or original if reranking disabled/failed)
         """
-        if not self.settings.enable_reranking or not results:
+        # Check for config override first, then fall back to settings
+        enable_reranking = (
+            config_metadata.get("enable_reranking", self.settings.enable_reranking)
+            if config_metadata
+            else self.settings.enable_reranking
+        )
+
+        if not enable_reranking or not results:
+            logger.debug("Reranking disabled (enable_reranking=%s), returning original results", enable_reranking)
             return results
 
         try:
@@ -685,6 +696,7 @@ async def search(self, search_input: SearchInput) -> SearchOutput:
                 query=search_input.question,
                 results=pipeline_result.query_results,
                 user_id=search_input.user_id,
+                config_metadata=search_input.config_metadata,
             )
             # Convert to CoT input with document context
             try:
@@ -928,6 +940,7 @@ async def search(self, search_input: SearchInput) -> SearchOutput:
                 query=search_input.question,
                 results=pipeline_result.query_results,
                 user_id=search_input.user_id,
+                config_metadata=search_input.config_metadata,
             )
 
             # Generate metadata
diff --git a/backend/rag_solution/services/system_initialization_service.py b/backend/rag_solution/services/system_initialization_service.py
index 68aca90d..8682412a 100644
--- a/backend/rag_solution/services/system_initialization_service.py
+++ b/backend/rag_solution/services/system_initialization_service.py
@@ -102,9 +102,11 @@ def _initialize_single_provider(
         try:
             if existing_provider:
                 logger.info(f"Updating provider: {name}")
-                provider = self.llm_provider_service.update_provider(
-                    existing_provider.id, config.model_dump(exclude_unset=True)
-                )
+                # Convert LLMProviderInput to LLMProviderUpdate
+                from rag_solution.schemas.llm_provider_schema import LLMProviderUpdate
+
+                updates = LLMProviderUpdate(**config.model_dump(exclude_unset=True))
+                provider = self.llm_provider_service.update_provider(existing_provider.id, updates)
                 if not provider:
                     raise LLMProviderError(name, "update", f"Failed to update {name}")
             else:
diff --git a/backend/tests/unit/test_reranker.py b/backend/tests/unit/test_reranker.py
index d31ea9d3..2d06b7b8 100644
--- a/backend/tests/unit/test_reranker.py
+++ b/backend/tests/unit/test_reranker.py
@@ -168,7 +168,7 @@ def test_create_reranking_prompts(
 
         assert len(prompts) == 3
         assert prompts[0]["query"] == "machine learning"
-        assert prompts[0]["document"] == "Machine learning is a subset of artificial intelligence."
+        assert prompts[0]["context"] == "Machine learning is a subset of artificial intelligence."
         assert prompts[0]["scale"] == "10"
 
     def test_create_reranking_prompts_skips_none_chunks(