Commit f14b54a

Restructure RAGAS evaluation;
1 parent 3218fb1 commit f14b54a

3 files changed: 22 additions, 47 deletions

3 files changed

+22
-47
lines changed

backend/tests/Ragas/utils/README.md

Lines changed: 3 additions & 7 deletions
````diff
@@ -68,13 +68,10 @@ python enhanced_run_evaluation_pipeline.py /path/to/your/document.pdf 3
 After running the enhanced pipeline which produces the `ragas_evaluation_with_responses.jsonl` file, you must run the RAGAS evaluation script:
 
 ```bash
-python ragas_evaluate.py
+python ragas_evaluate.py --llm 'name-of-llm'
 
 # With custom input/output paths
-python ragas_evaluate.py --input path/to/input.jsonl --output path/to/output.json
-
-# Skip chart generation
-python ragas_evaluate.py --no-chart
+python ragas_evaluate.py --llm 'name-of-llm' --input path/to/input.jsonl --output path/to/output.csv
 ```
 
 ## Environment Variables
@@ -116,8 +113,7 @@ All output files are stored in `../files/`:
 
 - `ragas_evaluation_dataset.jsonl`: Initial questions and references
 - `ragas_evaluation_with_responses.jsonl`: Questions with API responses
-- `ragas_eval_result.json`: Evaluation metrics as configured in RAGAS_METRICS (default: factual_correctness, semantic_similarity, answer_accuracy)
-- `ragas_eval_result_chart.png`: Visualization of evaluation results
+- `ragas_eval_result.csv`: CSV of evaluation metrics, which is appended to on each run. Includes a column for the LLM name to support retrospective graph generation.
 
 ## Troubleshooting
 
````
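With the chart step removed from the pipeline, the appended `ragas_eval_result.csv` (and its LLM-name column) is what retrospective graphs would be built from. Below is a minimal plotting sketch, assuming the default metric names listed above (`factual_correctness`, `semantic_similarity`, `answer_accuracy`) appear as columns, that the column for the model is named `llm`, and that the script runs from the Ragas root; the output filename is illustrative:

```python
# Sketch only: rebuild a per-LLM comparison chart from the accumulated CSV.
# Assumes the default metric columns and the "llm" column written by
# ragas_evaluate.py; adjust names if your RAGAS_METRICS configuration differs.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("files/ragas_eval_result.csv")
metrics = ["factual_correctness", "semantic_similarity", "answer_accuracy"]

# Mean score per metric for every LLM evaluated so far
summary = df.groupby("llm")[metrics].mean()
summary.plot(kind="bar", rot=0, figsize=(8, 4), title="RAGAS metrics by LLM")
plt.tight_layout()
plt.savefig("files/ragas_eval_result_by_llm.png")  # illustrative filename
```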

backend/tests/Ragas/utils/modules/ragas_evaluation.py

Lines changed: 5 additions & 23 deletions
```diff
@@ -6,7 +6,6 @@
 
 import os
 from pathlib import Path
-from typing import Optional
 import pandas as pd
 from ragas import evaluate, EvaluationDataset, SingleTurnSample
 from ragas.llms import LangchainLLMWrapper
@@ -15,6 +14,8 @@
 from ragas.embeddings import LangchainEmbeddingsWrapper
 from langchain_openai import OpenAIEmbeddings
 from dotenv import load_dotenv
+from .ragas_utils import load_jsonl_data
+
 
 
 # Find the project root (where .env is located)
@@ -119,23 +120,17 @@ def create_ragas_llm():
     return LangchainLLMWrapper(chat_model), ragas_embeddings
 
 
-async def evaluate_with_ragas(
-    jsonl_path: str, output_json_path: Optional[str] = None, skip_chart: bool = False
-) -> pd.DataFrame:
+async def evaluate_with_ragas(jsonl_path: str) -> pd.DataFrame:
     """
     Evaluate responses using RAGAS metrics
 
     Args:
         jsonl_path: Path to the input JSONL file with responses
-        output_json_path: Path to save the JSON results
-        skip_chart: Whether to skip generating the bar chart
 
     Returns:
         DataFrame with evaluation results
     """
     # Import locally to avoid circular imports
-    from .ragas_utils import load_jsonl_data, save_results_to_json
-    from .ragas_visualization import generate_bar_chart
 
     print("Setting up RAGAS evaluation...")
     print(f"Loading data from {jsonl_path}...")
@@ -198,7 +193,8 @@ async def evaluate_with_ragas(
                     row_dict[mapped] = df.loc[idx, raw]
                 # Attach llm_usage if supplied in original input sample
                 if "llm_usage" in processed_data[idx]:
-                    row_dict["llm_usage"] = processed_data[idx]["llm_usage"]
+                    for key, val in processed_data[idx]["llm_usage"].items():
+                        row_dict[key] = val
                 rows.append(row_dict)
 
             results_df = pd.DataFrame(rows)
@@ -232,20 +228,6 @@ async def evaluate_with_ragas(
             print(f"Could not process RAGAS results with llm_usage: {e}")
             raise
 
-        # Save results and generate visualization
-        if output_json_path:
-            # Save results to JSON
-            save_results_to_json(results_df, output_json_path)
-
-            # Generate visualization if not disabled
-            if not skip_chart:
-                try:
-                    chart_path = generate_bar_chart(output_json_path)
-                    if chart_path:
-                        print(f"Chart generated: {chart_path}")
-                except Exception as e:
-                    print(f"Chart generation failed: {e}")
-
         return results_df
 
     except Exception as e:
```
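The `llm_usage` change above replaces a single nested-dict column with one flat column per usage key, which serialises cleanly into the appended CSV. A small sketch of the effect, with hypothetical usage keys (`prompt_tokens`, `completion_tokens`) standing in for whatever the pipeline actually records:

```python
# Sketch of the flattening behaviour: the usage keys below are hypothetical
# examples, not necessarily what the pipeline stores in "llm_usage".
import pandas as pd

sample = {"llm_usage": {"prompt_tokens": 512, "completion_tokens": 128}}
row_dict = {"factual_correctness": 0.91}

# Old behaviour: one column holding the whole dict (awkward in CSV)
# row_dict["llm_usage"] = sample["llm_usage"]

# New behaviour: one flat column per usage key
for key, val in sample["llm_usage"].items():
    row_dict[key] = val

print(pd.DataFrame([row_dict]))
#    factual_correctness  prompt_tokens  completion_tokens
# 0                 0.91            512                128
```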

backend/tests/Ragas/utils/ragas_evaluate.py

Lines changed: 14 additions & 17 deletions
```diff
@@ -9,33 +9,30 @@
 import sys
 import argparse
 import asyncio
-import pandas as pd
 from modules.ragas_evaluation import evaluate_with_ragas
 
 
-async def run_evaluation(input_path: str, output_path: str, skip_chart: bool = False) -> pd.DataFrame:
+async def run_evaluation(input_path: str, output_path: str, llm: str) -> None:
     """
     Run the RAGAS evaluation process end-to-end
 
     Args:
         input_path: Path to input JSONL file with responses
         output_path: Path to save JSON output results
-        skip_chart: Whether to skip chart generation
-
-    Returns:
-        DataFrame with evaluation results
+        llm: The LLM model to use for evaluation
     """
 
     # Run RAGAS evaluation
     print(f"Running RAGAS evaluation on {input_path}...")
-    results_df = await evaluate_with_ragas(input_path, output_path, skip_chart)
+    results_df = await evaluate_with_ragas(input_path)
+    results_df["llm"] = llm
+    print("RAGAS evaluation completed.")
+    print(f"Appending results to CSV file... {output_path}")
 
-    print(f"Evaluation complete! Results saved to {output_path}")
-    if not skip_chart:
-        chart_path = output_path.replace(".json", "_chart.png")
-        print(f"Chart saved to {chart_path}")
+    file_exists = os.path.isfile(output_path)
+    results_df.to_csv(output_path, mode='a', header=not file_exists, index=False)
 
-    return results_df
+    print("Results appended to CSV file.")
 
 
 async def main():
@@ -44,12 +41,13 @@ async def main():
     # Set up default file paths
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # Navigate to Ragas root
     default_input = os.path.normpath(os.path.join(base_dir, "files/ragas_evaluation_with_responses.jsonl"))
-    default_output = os.path.normpath(os.path.join(base_dir, "files/ragas_eval_result.json"))
+    default_output = os.path.normpath(os.path.join(base_dir, "files/ragas_eval_result.csv"))
 
     # Parse command line arguments
     parser = argparse.ArgumentParser(description="Evaluate responses using RAGAS metrics")
+    parser.add_argument("--llm", "-l", help="LLM model to use for evaluation")
     parser.add_argument("--input", "-i", dest="input_jsonl", help="Path to input JSONL file", default=default_input)
-    parser.add_argument("--output", "-o", help="Path to save JSON output", default=default_output)
+    parser.add_argument("--output", "-o", help="Path to save CSV output", default=default_output)
     parser.add_argument("--no-chart", action="store_true", help="Skip chart visualization")
     args = parser.parse_args()
 
@@ -60,12 +58,11 @@
 
     print(f"Input file: {args.input_jsonl}")
     print(f"Output file: {args.output}")
-    if args.no_chart:
-        print("Chart generation is disabled")
+    print(f"LLM model: {args.llm}")
 
     # Run evaluation
     try:
-        await run_evaluation(args.input_jsonl, args.output, args.no_chart)
+        await run_evaluation(args.input_jsonl, args.output, args.llm)
     except Exception as e:
         print(f"Error during evaluation: {str(e)}")
         sys.exit(1)
```
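The append logic in `run_evaluation` writes the CSV header only when the file does not yet exist, so running the script once per LLM accumulates comparable rows in a single file. A minimal sketch of that idiom, using placeholder paths and model names rather than the script's real defaults:

```python
# Sketch of the header-once append idiom used above; the path and model
# names are placeholders, not the script's actual defaults.
import os
import pandas as pd

output_path = "demo_eval_result.csv"

for llm in ["model-a", "model-b"]:  # stand-ins for successive --llm values
    results_df = pd.DataFrame([{"answer_accuracy": 0.8, "llm": llm}])
    file_exists = os.path.isfile(output_path)
    results_df.to_csv(output_path, mode="a", header=not file_exists, index=False)

with open(output_path) as f:
    print(f.read())
# answer_accuracy,llm
# 0.8,model-a
# 0.8,model-b
```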
