
Commit c06c92c

Ragas evaluate to include correct mapping for answer accuracy (#36)
* Ragas evaluate to include correct mapping for answer accuracy
* Refactor RAGAS evaluation result processing to improve column mapping and error handling
* Refactor evaluate_with_ragas to improve error handling and streamline DataFrame processing
* Resolve type check errors
* Improve average calculation in evaluate_with_ragas to handle an empty numeric DataFrame
* Resolve to_dict type error
* Check the type
* Resolve type error
* Fix average calculation in evaluate_with_ragas to handle different mean types
* Enhance type hinting in evaluate_with_ragas for better type checking and clarity
* Refactor evaluate_with_ragas to improve DataFrame handling and type assertions
* Apply changes based on review comments
* Fix linting errors
1 parent a078396 commit c06c92c

File tree

1 file changed: +63 −74 lines changed

backend/tests/Ragas/utils/modules/ragas_evaluation.py

Lines changed: 63 additions & 74 deletions
@@ -162,82 +162,71 @@ async def evaluate_with_ragas(
     print("Running RAGAS evaluation (this may take a while)...")
     results = evaluate(dataset=dataset, metrics=metrics, llm=llm)

-    # Define the expected metrics we want in our final output
-    expected_metrics = ["factual_correctness", "semantic_similarity", "answer_accuracy"]
-
-    # Create a base DataFrame for our results
-    result_data = []
-
-    # Create results for each sample with placeholder metric values and original data
-    for i, (sample, original_data) in enumerate(zip(samples, processed_data)):
-        result_record = {
-            "question": sample.user_input,
-            **{metric: None for metric in expected_metrics},  # Initialize all metrics as None
-        }
-
-        # Include usage data if available in the original data
-        if "llm_usage" in original_data:
-            result_record["llm_usage"] = original_data["llm_usage"]
-
-        result_data.append(result_record)
-
     try:
-        if hasattr(results, "to_pandas"):
-            scores_df = results.to_pandas()
-            available_columns = list(scores_df.columns)
-            print(f"Results DataFrame columns: {available_columns}")
-
-            # Map available columns to expected metrics
-            for i, (_, row) in enumerate(scores_df.iterrows()):
-                if i < len(result_data):
-                    for col in available_columns:
-                        matching_metric = next(
-                            (m for m in expected_metrics if m in col.lower() or col.lower() in m), None
-                        )
-                        if matching_metric and i < len(result_data):
-                            result_data[i][matching_metric] = row[col]
-    except Exception as e:
-        print(f"Could not convert results to DataFrame: {e}")
-
-    # Create DataFrame from the results
-    results_df = pd.DataFrame(result_data)
-
-    # Calculate and add average scores
-    # Type annotation to avoid type checking issues
-    avg_data: dict = {"question": "AVERAGE"}
-
-    # Calculate means for each metric using non-null values
-    for metric in expected_metrics:
-        if metric in results_df.columns:
-            non_null_values = results_df[metric].dropna()
-            if len(non_null_values) > 0:
-                avg_value = float(non_null_values.mean())
-                # Convert to float for consistency
-                avg_data[metric] = avg_value  # Using a dict with explicit type annotation allows mixed types
-                print(f"Average {metric}: {avg_value:.4f} from {len(non_null_values)} values")
-            else:
-                avg_data[metric] = None  # None is fine for a dict with explicit type annotation
-                print(f"No valid values for {metric}")
-
-    # Calculate aggregate usage data if available
-    if "llm_usage" in results_df.columns:
-        usage_records = results_df["llm_usage"].dropna()
-        if len(usage_records) > 0:
-            # Aggregate usage statistics using helper function
-            total_prompt_tokens = aggregate_usage_field(usage_records, "prompt_tokens")
-            total_completion_tokens = aggregate_usage_field(usage_records, "completion_tokens")
-            total_tokens = aggregate_usage_field(usage_records, "total_tokens")
-            total_duration = aggregate_usage_field(usage_records, "duration_seconds")
-
-            avg_data["llm_usage"] = {
-                "total_prompt_tokens": total_prompt_tokens,
-                "total_completion_tokens": total_completion_tokens,
-                "total_tokens": total_tokens,
-                "total_duration_seconds": total_duration,
+        print("Processing evaluation results including llm_usage if present...")
+        # Define expected metrics for alignment and output naming
+        expected_metrics = [
+            ("factual_correctness(mode=f1)", "factual_correctness"),
+            ("semantic_similarity", "semantic_similarity"),
+            ("nv_accuracy", "answer_accuracy"),
+        ]
+
+        df = results.to_pandas()
+        available_columns = list(df.columns)
+        print(f"Results DataFrame columns: {available_columns}")
+
+        # Verify required columns
+        missing = [raw for raw, _ in expected_metrics if raw not in available_columns]
+        if missing:
+            raise ValueError(
+                f"Missing expected columns in RAGAS output: {missing}. Update column mappings or metric extraction."
+            )
+
+        # Build per-sample rows with metrics and attach llm_usage from original processed_data if present
+        rows = []
+        for idx in range(len(df)):
+            row_dict = {
+                "question": df.loc[idx, "user_input"]
+                if "user_input" in df.columns
+                else processed_data[idx].get("user_input", "")
             }
-
-    # Add averages row to the DataFrame
-    results_df = pd.concat([results_df, pd.DataFrame([avg_data])], ignore_index=True)
+            for raw, mapped in expected_metrics:
+                row_dict[mapped] = df.loc[idx, raw]
+            # Attach llm_usage if supplied in original input sample
+            if "llm_usage" in processed_data[idx]:
+                row_dict["llm_usage"] = processed_data[idx]["llm_usage"]
+            rows.append(row_dict)
+
+        results_df = pd.DataFrame(rows)
+
+        # Compute averages manually (exclude llm_usage from mean calc)
+        avg_data: dict = {"question": "AVERAGE"}
+        metric_names = [mapped for _, mapped in expected_metrics]
+        for metric in metric_names:
+            if metric in results_df.columns:
+                non_null = results_df[metric].dropna()
+                avg_data[metric] = float(non_null.mean()) if len(non_null) else None
+
+        # Aggregate llm usage across samples
+        if "llm_usage" in results_df.columns:
+            usage_records = results_df["llm_usage"].dropna()
+            if len(usage_records) > 0:
+                total_prompt_tokens = aggregate_usage_field(usage_records, "prompt_tokens")
+                total_completion_tokens = aggregate_usage_field(usage_records, "completion_tokens")
+                total_tokens = aggregate_usage_field(usage_records, "total_tokens")
+                total_duration = aggregate_usage_field(usage_records, "duration_seconds")
+                avg_data["llm_usage"] = {
+                    "total_prompt_tokens": total_prompt_tokens,
+                    "total_completion_tokens": total_completion_tokens,
+                    "total_tokens": total_tokens,
+                    "total_duration_seconds": total_duration,
+                }
+
+        # Append average row
+        results_df = pd.concat([results_df, pd.DataFrame([avg_data])], ignore_index=True)
+    except Exception as e:
+        print(f"Could not process RAGAS results with llm_usage: {e}")
+        raise

     # Save results and generate visualization
     if output_json_path:
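
The core of the change is an explicit mapping from the raw column names produced by ragas' `results.to_pandas()` (for example `nv_accuracy` for the answer-accuracy metric) to the names used in the report, followed by a manually computed AVERAGE row. The snippet below is a minimal standalone sketch of that mapping-and-averaging logic; the column names come from the patch, while the DataFrame and its scores are made up for illustration and stand in for a real evaluation result.

```python
# Standalone sketch of the column-mapping and averaging approach from the patch.
# The hand-built DataFrame replaces a real results.to_pandas() output.
import pandas as pd

RAW_TO_OUTPUT = [
    ("factual_correctness(mode=f1)", "factual_correctness"),
    ("semantic_similarity", "semantic_similarity"),
    ("nv_accuracy", "answer_accuracy"),
]

# Pretend this came from results.to_pandas(); values are illustrative only.
df = pd.DataFrame(
    {
        "user_input": ["What is RAGAS?", "How are tokens counted?"],
        "factual_correctness(mode=f1)": [0.82, 0.64],
        "semantic_similarity": [0.91, 0.88],
        "nv_accuracy": [1.0, 0.5],
    }
)

# Fail fast if the expected raw columns are not present.
missing = [raw for raw, _ in RAW_TO_OUTPUT if raw not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns in RAGAS output: {missing}")

# Build one output row per sample, renaming raw metric columns as we go.
rows = []
for idx in range(len(df)):
    row = {"question": df.loc[idx, "user_input"]}
    for raw, mapped in RAW_TO_OUTPUT:
        row[mapped] = df.loc[idx, raw]
    rows.append(row)

results_df = pd.DataFrame(rows)

# Compute the AVERAGE row the same way the patch does, skipping NaNs.
avg_row: dict = {"question": "AVERAGE"}
for _, mapped in RAW_TO_OUTPUT:
    non_null = results_df[mapped].dropna()
    avg_row[mapped] = float(non_null.mean()) if len(non_null) else None

results_df = pd.concat([results_df, pd.DataFrame([avg_row])], ignore_index=True)
print(results_df)
```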
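The patch also calls an `aggregate_usage_field` helper that lives elsewhere in the repository and is not shown in this diff. A plausible minimal version, assuming each entry in `usage_records` is a dict-like object with numeric fields, could look like the following; treat the signature and behaviour as an illustration rather than the project's actual implementation.

```python
# Hypothetical sketch of the aggregate_usage_field helper referenced in the diff.
# Assumption: each usage record is a dict-like mapping of numeric usage fields.
from typing import Iterable, Mapping


def aggregate_usage_field(usage_records: Iterable[Mapping], field: str) -> float:
    """Sum a numeric field across usage records, ignoring records that lack it."""
    total = 0.0
    for record in usage_records:
        value = record.get(field)
        if isinstance(value, (int, float)):
            total += value
    return total


# Example: summing prompt tokens across two samples.
records = [
    {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165, "duration_seconds": 1.8},
    {"prompt_tokens": 98, "completion_tokens": 60, "total_tokens": 158, "duration_seconds": 2.1},
]
print(aggregate_usage_field(records, "prompt_tokens"))  # 218.0
```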
