Commit 135313f

OSAI-101 - Restructure RAGAS evaluation (#39)
* Restructure RAGAS evaluation
* Reformat code
* Remove no-chart option
* Add visualisation workbook
* Updated workbook for visualisation
* New metrics
* ruff format

Co-authored-by: SCOTT\afonseca <[email protected]>
1 parent 3218fb1 commit 135313f

5 files changed (+575 -61 lines)


backend/tests/Ragas/utils/README.md

Lines changed: 3 additions & 7 deletions
@@ -68,13 +68,10 @@ python enhanced_run_evaluation_pipeline.py /path/to/your/document.pdf 3
 After running the enhanced pipeline which produces the `ragas_evaluation_with_responses.jsonl` file, you must run the RAGAS evaluation script:
 
 ```bash
-python ragas_evaluate.py
+python ragas_evaluate.py --llm 'name-of-llm'
 
 # With custom input/output paths
-python ragas_evaluate.py --input path/to/input.jsonl --output path/to/output.json
-
-# Skip chart generation
-python ragas_evaluate.py --no-chart
+python ragas_evaluate.py --llm 'name-of-llm' --input path/to/input.jsonl --output path/to/output.csv
 ```
 
 ## Environment Variables
@@ -116,8 +113,7 @@ All output files are stored in `../files/`:
 
 - `ragas_evaluation_dataset.jsonl`: Initial questions and references
 - `ragas_evaluation_with_responses.jsonl`: Questions with API responses
-- `ragas_eval_result.json`: Evaluation metrics as configured in RAGAS_METRICS (default: factual_correctness, semantic_similarity, answer_accuracy)
-- `ragas_eval_result_chart.png`: Visualization of evaluation results
+- `ragas_eval_result.csv`: CSV of evaluation metrics, which is appended to on each run. Includes a column for the LLM name to support retrospective graph generation.
 
 ## Troubleshooting
 
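
The README now points at an append-only CSV with an `llm` column. The visualisation workbook added in this commit is not shown in the diff, but a minimal sketch of the retrospective graph generation the CSV enables might look like the following, assuming the metric columns written by the restructured pipeline (`factual_correctness`, `answer_accuracy`, `semantic_similarity`) and the default output path:

```python
import pandas as pd
import matplotlib.pyplot as plt

# Metric columns produced by the restructured evaluation (see expected_metrics in ragas_evaluation.py).
METRICS = ["factual_correctness", "answer_accuracy", "semantic_similarity"]

# The CSV is appended to on every run, so it accumulates one block of rows per evaluated LLM.
df = pd.read_csv("../files/ragas_eval_result.csv")

# Average each metric per LLM to compare models across runs.
summary = df.groupby("llm")[METRICS].mean()

summary.plot(kind="bar", figsize=(8, 4), ylim=(0, 1))
plt.ylabel("Mean score")
plt.title("RAGAS metrics by LLM")
plt.tight_layout()
plt.show()
```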

backend/tests/Ragas/utils/modules/ragas_evaluation.py

Lines changed: 10 additions & 36 deletions
@@ -6,15 +6,15 @@
 
 import os
 from pathlib import Path
-from typing import Optional
 import pandas as pd
 from ragas import evaluate, EvaluationDataset, SingleTurnSample
 from ragas.llms import LangchainLLMWrapper
 from langchain_openai.chat_models import ChatOpenAI
-from ragas.metrics import answer_relevancy, ContextRelevance, SemanticSimilarity, context_precision
+from ragas.metrics import AnswerAccuracy, SemanticSimilarity, FactualCorrectness
 from ragas.embeddings import LangchainEmbeddingsWrapper
 from langchain_openai import OpenAIEmbeddings
 from dotenv import load_dotenv
+from .ragas_utils import load_jsonl_data
 
 
 # Find the project root (where .env is located)
@@ -67,7 +67,6 @@ def create_ragas_dataset(data):
         # Create a sample using the RAGAS SingleTurnSample class
         eval_sample = SingleTurnSample(
             user_input=sample.get("user_input", ""),
-            retrieved_contexts=[context for context in sample.get("reference_contexts", []) if context],
             response=sample.get("response", ""),
             reference=reference,  # Use either provided reference or first context
         )
@@ -119,23 +118,17 @@ def create_ragas_llm():
     return LangchainLLMWrapper(chat_model), ragas_embeddings
 
 
-async def evaluate_with_ragas(
-    jsonl_path: str, output_json_path: Optional[str] = None, skip_chart: bool = False
-) -> pd.DataFrame:
+async def evaluate_with_ragas(jsonl_path: str) -> pd.DataFrame:
     """
     Evaluate responses using RAGAS metrics
 
     Args:
         jsonl_path: Path to the input JSONL file with responses
-        output_json_path: Path to save the JSON results
-        skip_chart: Whether to skip generating the bar chart
 
     Returns:
         DataFrame with evaluation results
     """
     # Import locally to avoid circular imports
-    from .ragas_utils import load_jsonl_data, save_results_to_json
-    from .ragas_visualization import generate_bar_chart
 
     print("Setting up RAGAS evaluation...")
     print(f"Loading data from {jsonl_path}...")
@@ -150,28 +143,22 @@ async def evaluate_with_ragas(
     dataset, samples, processed_data = create_ragas_dataset(data)
 
     # Define metrics to use for evaluation
-    print(
-        "Configuring default RAGAS metrics: semantic_similarity, "
-        "answer_relevancy, context_relevance, context_precision"
-    )
+    print("Configuring default RAGAS metrics: semantic_similarity,factual_correctness, answer_accuracy")
     metrics = [
         SemanticSimilarity(),
-        answer_relevancy,
-        context_precision,
-        ContextRelevance(llm=llm),
+        FactualCorrectness(llm=llm),
+        AnswerAccuracy(llm=llm),
     ]
 
     # Run the evaluation
     print("Running RAGAS evaluation (this may take a while)...")
    results = evaluate(dataset=dataset, metrics=metrics, llm=llm)
-
     try:
         print("Processing evaluation results including llm_usage if present...")
         # Define expected metrics for alignment and output naming
         expected_metrics = [
-            ("nv_context_relevance", "recontext_relevance"),
-            ("context_precision", "context_precision"),
-            ("answer_relevancy", "answer_relevancy"),
+            ("factual_correctness(mode=f1)", "factual_correctness"),
+            ("nv_accuracy", "answer_accuracy"),
             ("semantic_similarity", "semantic_similarity"),
         ]
 
@@ -198,7 +185,8 @@ async def evaluate_with_ragas(
                 row_dict[mapped] = df.loc[idx, raw]
             # Attach llm_usage if supplied in original input sample
             if "llm_usage" in processed_data[idx]:
-                row_dict["llm_usage"] = processed_data[idx]["llm_usage"]
+                for key, val in processed_data[idx]["llm_usage"].items():
+                    row_dict[key] = val
             rows.append(row_dict)
 
         results_df = pd.DataFrame(rows)
@@ -232,20 +220,6 @@ async def evaluate_with_ragas(
            print(f"Could not process RAGAS results with llm_usage: {e}")
            raise
 
-        # Save results and generate visualization
-        if output_json_path:
-            # Save results to JSON
-            save_results_to_json(results_df, output_json_path)
-
-            # Generate visualization if not disabled
-            if not skip_chart:
-                try:
-                    chart_path = generate_bar_chart(output_json_path)
-                    if chart_path:
-                        print(f"Chart generated: {chart_path}")
-                except Exception as e:
-                    print(f"Chart generation failed: {e}")
-
         return results_df
 
     except Exception as e:
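
The llm_usage change in the last hunk spreads the nested usage dict into one top-level column per key instead of storing the whole dict in a single cell, which keeps the rows flat for CSV output. A minimal illustration of the difference, using hypothetical usage keys (the real key names come from the input JSONL and are not shown in this diff):

```python
import pandas as pd

# Hypothetical llm_usage payload attached to one input sample.
llm_usage = {"prompt_tokens": 512, "completion_tokens": 128}

row_old = {"factual_correctness": 0.82, "llm_usage": llm_usage}  # old behaviour: whole dict in one cell
row_new = {"factual_correctness": 0.82, **llm_usage}             # new behaviour: one column per key

print(pd.DataFrame([row_old]).columns.tolist())  # ['factual_correctness', 'llm_usage']
print(pd.DataFrame([row_new]).columns.tolist())  # ['factual_correctness', 'prompt_tokens', 'completion_tokens']
```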

backend/tests/Ragas/utils/ragas_evaluate.py

Lines changed: 14 additions & 18 deletions
@@ -9,33 +9,30 @@
 import sys
 import argparse
 import asyncio
-import pandas as pd
 from modules.ragas_evaluation import evaluate_with_ragas
 
 
-async def run_evaluation(input_path: str, output_path: str, skip_chart: bool = False) -> pd.DataFrame:
+async def run_evaluation(input_path: str, output_path: str, llm: str) -> None:
     """
     Run the RAGAS evaluation process end-to-end
 
     Args:
         input_path: Path to input JSONL file with responses
         output_path: Path to save JSON output results
-        skip_chart: Whether to skip chart generation
-
-    Returns:
-        DataFrame with evaluation results
+        llm: The LLM model to use for evaluation
     """
 
     # Run RAGAS evaluation
     print(f"Running RAGAS evaluation on {input_path}...")
-    results_df = await evaluate_with_ragas(input_path, output_path, skip_chart)
+    results_df = await evaluate_with_ragas(input_path)
+    results_df["llm"] = llm
+    print("RAGAS evaluation completed.")
+    print(f"Appending results to CSV file... {output_path}")
 
-    print(f"Evaluation complete! Results saved to {output_path}")
-    if not skip_chart:
-        chart_path = output_path.replace(".json", "_chart.png")
-        print(f"Chart saved to {chart_path}")
+    file_exists = os.path.isfile(output_path)
+    results_df.to_csv(output_path, mode="a", header=not file_exists, index=False)
 
-    return results_df
+    print("Results appended to CSV file.")
 
 
 async def main():
@@ -44,13 +41,13 @@ async def main():
     # Set up default file paths
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # Navigate to Ragas root
     default_input = os.path.normpath(os.path.join(base_dir, "files/ragas_evaluation_with_responses.jsonl"))
-    default_output = os.path.normpath(os.path.join(base_dir, "files/ragas_eval_result.json"))
+    default_output = os.path.normpath(os.path.join(base_dir, "files/ragas_eval_result.csv"))
 
     # Parse command line arguments
     parser = argparse.ArgumentParser(description="Evaluate responses using RAGAS metrics")
+    parser.add_argument("--llm", "-l", help="LLM model to use for evaluation")
     parser.add_argument("--input", "-i", dest="input_jsonl", help="Path to input JSONL file", default=default_input)
-    parser.add_argument("--output", "-o", help="Path to save JSON output", default=default_output)
-    parser.add_argument("--no-chart", action="store_true", help="Skip chart visualization")
+    parser.add_argument("--output", "-o", help="Path to save CSV output", default=default_output)
     args = parser.parse_args()
 
     # Validate input file
@@ -60,12 +57,11 @@
 
     print(f"Input file: {args.input_jsonl}")
     print(f"Output file: {args.output}")
-    if args.no_chart:
-        print("Chart generation is disabled")
+    print(f"LLM model: {args.llm}")
 
     # Run evaluation
     try:
-        await run_evaluation(args.input_jsonl, args.output, args.no_chart)
+        await run_evaluation(args.input_jsonl, args.output, args.llm)
     except Exception as e:
         print(f"Error during evaluation: {str(e)}")
         sys.exit(1)
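
`run_evaluation` now appends to the output CSV and only writes a header when the file does not exist yet, so repeated invocations (for example one per `--llm` value) accumulate comparable rows under a single header. A small self-contained illustration of that append pattern, using a throwaway path and stand-in results:

```python
import os
import pandas as pd

output_path = "demo_eval_result.csv"  # throwaway path, for illustration only

for llm_name in ("model-a", "model-b"):  # hypothetical LLM names
    # Stand-in for the DataFrame returned by evaluate_with_ragas, tagged with the LLM name.
    results_df = pd.DataFrame([{"semantic_similarity": 0.9, "llm": llm_name}])
    file_exists = os.path.isfile(output_path)
    # Header is written only on the first append, keeping the file a single valid CSV.
    results_df.to_csv(output_path, mode="a", header=not file_exists, index=False)

print(pd.read_csv(output_path))  # two rows, one per simulated run
os.remove(output_path)  # clean up the demo file
```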

backend/tests/Ragas/utils/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@ python-dotenv>=1.0.0
 pandas>=2.0.0
 nest_asyncio>=1.5.6
 matplotlib>=3.10.5
+ipykernel==7.0.1
