@@ -162,82 +162,71 @@ async def evaluate_with_ragas(
162162 print ("Running RAGAS evaluation (this may take a while)..." )
163163 results = evaluate (dataset = dataset , metrics = metrics , llm = llm )
164164
165- # Define the expected metrics we want in our final output
166- expected_metrics = ["factual_correctness" , "semantic_similarity" , "answer_accuracy" ]
167-
168- # Create a base DataFrame for our results
169- result_data = []
170-
171- # Create results for each sample with placeholder metric values and original data
172- for i , (sample , original_data ) in enumerate (zip (samples , processed_data )):
173- result_record = {
174- "question" : sample .user_input ,
175- ** {metric : None for metric in expected_metrics }, # Initialize all metrics as None
176- }
177-
178- # Include usage data if available in the original data
179- if "llm_usage" in original_data :
180- result_record ["llm_usage" ] = original_data ["llm_usage" ]
181-
182- result_data .append (result_record )
183-
184165 try :
185- if hasattr (results , "to_pandas" ):
186- scores_df = results .to_pandas ()
187- available_columns = list (scores_df .columns )
188- print (f"Results DataFrame columns: { available_columns } " )
189-
190- # Map available columns to expected metrics
191- for i , (_ , row ) in enumerate (scores_df .iterrows ()):
192- if i < len (result_data ):
193- for col in available_columns :
194- matching_metric = next (
195- (m for m in expected_metrics if m in col .lower () or col .lower () in m ), None
196- )
197- if matching_metric and i < len (result_data ):
198- result_data [i ][matching_metric ] = row [col ]
199- except Exception as e :
200- print (f"Could not convert results to DataFrame: { e } " )
201-
202- # Create DataFrame from the results
203- results_df = pd .DataFrame (result_data )
204-
205- # Calculate and add average scores
206- # Type annotation to avoid type checking issues
207- avg_data : dict = {"question" : "AVERAGE" }
208-
209- # Calculate means for each metric using non-null values
210- for metric in expected_metrics :
211- if metric in results_df .columns :
212- non_null_values = results_df [metric ].dropna ()
213- if len (non_null_values ) > 0 :
214- avg_value = float (non_null_values .mean ())
215- # Convert to float for consistency
216- avg_data [metric ] = avg_value # Using a dict with explicit type annotation allows mixed types
217- print (f"Average { metric } : { avg_value :.4f} from { len (non_null_values )} values" )
218- else :
219- avg_data [metric ] = None # None is fine for a dict with explicit type annotation
220- print (f"No valid values for { metric } " )
221-
222- # Calculate aggregate usage data if available
223- if "llm_usage" in results_df .columns :
224- usage_records = results_df ["llm_usage" ].dropna ()
225- if len (usage_records ) > 0 :
226- # Aggregate usage statistics using helper function
227- total_prompt_tokens = aggregate_usage_field (usage_records , "prompt_tokens" )
228- total_completion_tokens = aggregate_usage_field (usage_records , "completion_tokens" )
229- total_tokens = aggregate_usage_field (usage_records , "total_tokens" )
230- total_duration = aggregate_usage_field (usage_records , "duration_seconds" )
231-
232- avg_data ["llm_usage" ] = {
233- "total_prompt_tokens" : total_prompt_tokens ,
234- "total_completion_tokens" : total_completion_tokens ,
235- "total_tokens" : total_tokens ,
236- "total_duration_seconds" : total_duration ,
166+ print ("Processing evaluation results including llm_usage if present..." )
167+ # Define expected metrics for alignment and output naming
168+ expected_metrics = [
169+ ("factual_correctness(mode=f1)" , "factual_correctness" ),
170+ ("semantic_similarity" , "semantic_similarity" ),
171+ ("nv_accuracy" , "answer_accuracy" ),
172+ ]
173+
174+ df = results .to_pandas ()
175+ available_columns = list (df .columns )
176+ print (f"Results DataFrame columns: { available_columns } " )
177+
178+ # Verify required columns
179+ missing = [raw for raw , _ in expected_metrics if raw not in available_columns ]
180+ if missing :
181+ raise ValueError (
182+ f"Missing expected columns in RAGAS output: { missing } . Update column mappings or metric extraction."
183+ )
184+
185+ # Build per-sample rows with metrics and attach llm_usage from original processed_data if present
186+ rows = []
187+ for idx in range (len (df )):
188+ row_dict = {
189+ "question" : df .loc [idx , "user_input" ]
190+ if "user_input" in df .columns
191+ else processed_data [idx ].get ("user_input" , "" )
237192 }
238-
239- # Add averages row to the DataFrame
240- results_df = pd .concat ([results_df , pd .DataFrame ([avg_data ])], ignore_index = True )
193+ for raw , mapped in expected_metrics :
194+ row_dict [mapped ] = df .loc [idx , raw ]
195+ # Attach llm_usage if supplied in original input sample
196+ if "llm_usage" in processed_data [idx ]:
197+ row_dict ["llm_usage" ] = processed_data [idx ]["llm_usage" ]
198+ rows .append (row_dict )
199+
200+ results_df = pd .DataFrame (rows )
201+
202+ # Compute averages manually (exclude llm_usage from mean calc)
203+ avg_data : dict = {"question" : "AVERAGE" }
204+ metric_names = [mapped for _ , mapped in expected_metrics ]
205+ for metric in metric_names :
206+ if metric in results_df .columns :
207+ non_null = results_df [metric ].dropna ()
208+ avg_data [metric ] = float (non_null .mean ()) if len (non_null ) else None
209+
210+ # Aggregate llm usage across samples
211+ if "llm_usage" in results_df .columns :
212+ usage_records = results_df ["llm_usage" ].dropna ()
213+ if len (usage_records ) > 0 :
214+ total_prompt_tokens = aggregate_usage_field (usage_records , "prompt_tokens" )
215+ total_completion_tokens = aggregate_usage_field (usage_records , "completion_tokens" )
216+ total_tokens = aggregate_usage_field (usage_records , "total_tokens" )
217+ total_duration = aggregate_usage_field (usage_records , "duration_seconds" )
218+ avg_data ["llm_usage" ] = {
219+ "total_prompt_tokens" : total_prompt_tokens ,
220+ "total_completion_tokens" : total_completion_tokens ,
221+ "total_tokens" : total_tokens ,
222+ "total_duration_seconds" : total_duration ,
223+ }
224+
225+ # Append average row
226+ results_df = pd .concat ([results_df , pd .DataFrame ([avg_data ])], ignore_index = True )
227+ except Exception as e :
228+ print (f"Could not process RAGAS results with llm_usage: { e } " )
229+ raise
241230
242231 # Save results and generate visualization
243232 if output_json_path :
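
Note on the aggregate_usage_field helper called above: it is defined elsewhere in the module and not shown in this hunk. A minimal sketch of what such a helper might look like, assuming it simply sums one numeric field across the per-sample llm_usage dicts (the repository's actual implementation may differ), is:

def aggregate_usage_field(usage_records, field):
    """Sum a numeric usage field (e.g. "prompt_tokens") across llm_usage dicts.

    Hypothetical sketch for illustration only; the real helper may handle
    missing fields, nested structures, or non-numeric values differently.
    """
    total = 0.0
    for record in usage_records:  # works for a list or a pandas Series of dicts
        value = record.get(field) if isinstance(record, dict) else None
        if isinstance(value, (int, float)):
            total += value
    return total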