10 | 10 | from ragas import evaluate, EvaluationDataset, SingleTurnSample |
11 | 11 | from ragas.llms import LangchainLLMWrapper |
12 | 12 | from langchain_openai.chat_models import ChatOpenAI |
13 | | -from ragas.metrics import answer_relevancy, ContextRelevance, SemanticSimilarity, context_precision |
| 13 | +from ragas.metrics import AnswerAccuracy, SemanticSimilarity, FactualCorrectness |
| 14 | + |
14 | 15 | from ragas.embeddings import LangchainEmbeddingsWrapper |
15 | 16 | from langchain_openai import OpenAIEmbeddings |
16 | 17 | from dotenv import load_dotenv |
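The import hunk above swaps the metric set to AnswerAccuracy, SemanticSimilarity, and FactualCorrectness while keeping the LangChain wrappers. As a rough sketch of how those wrappers are typically instantiated before being handed to RAGAS (the model names below are placeholders, not taken from this change):

```python
# Sketch only: wrapping LangChain models so RAGAS metrics can call them.
# Model names are illustrative assumptions, not part of this diff.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))
```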
@@ -67,7 +68,6 @@ def create_ragas_dataset(data): |
67 | 68 | # Create a sample using the RAGAS SingleTurnSample class |
68 | 69 | eval_sample = SingleTurnSample( |
69 | 70 | user_input=sample.get("user_input", ""), |
70 | | - retrieved_contexts=[context for context in sample.get("reference_contexts", []) if context], |
71 | 71 | response=sample.get("response", ""), |
72 | 72 | reference=reference, # Use either provided reference or first context |
73 | 73 | ) |
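With retrieved_contexts dropped, each sample only needs the user input, the model response, and a reference answer, which is all the reference-based metrics below consume. A minimal sketch of the resulting dataset construction under that assumption (the records list is illustrative; in the script the samples come from the loaded JSONL data):

```python
from ragas import EvaluationDataset, SingleTurnSample

# Illustrative input; the real script reads these records from a JSONL file.
records = [
    {"user_input": "What does the policy cover?", "response": "...", "reference": "..."},
]

samples = [
    SingleTurnSample(
        user_input=r.get("user_input", ""),
        response=r.get("response", ""),
        reference=r.get("reference", ""),  # retrieved_contexts is no longer passed
    )
    for r in records
]
dataset = EvaluationDataset(samples=samples)
```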
@@ -145,33 +145,31 @@ async def evaluate_with_ragas(jsonl_path: str) -> pd.DataFrame: |
145 | 145 |
146 | 146 | # Define metrics to use for evaluation |
147 | 147 | print( |
148 | | - "Configuring default RAGAS metrics: semantic_similarity, " |
149 | | - "answer_relevancy, context_relevance, context_precision" |
| 148 | + "Configuring default RAGAS metrics: semantic_similarity, factual_correctness, answer_accuracy" |
150 | 149 | ) |
151 | 150 | metrics = [ |
152 | 151 | SemanticSimilarity(), |
153 | | - answer_relevancy, |
154 | | - context_precision, |
155 | | - ContextRelevance(llm=llm), |
| 152 | + FactualCorrectness(llm=llm), |
| 153 | + AnswerAccuracy(llm=llm), |
156 | 154 | ] |
157 | 155 |
158 | 156 | # Run the evaluation |
159 | 157 | print("Running RAGAS evaluation (this may take a while)...") |
160 | 158 | results = evaluate(dataset=dataset, metrics=metrics, llm=llm) |
161 | | - |
162 | 159 | try: |
163 | 160 | print("Processing evaluation results including llm_usage if present...") |
164 | 161 | # Define expected metrics for alignment and output naming |
165 | 162 | expected_metrics = [ |
166 | | - ("nv_context_relevance", "recontext_relevance"), |
167 | | - ("context_precision", "context_precision"), |
168 | | - ("answer_relevancy", "answer_relevancy"), |
169 | | - ("semantic_similarity", "semantic_similarity"), |
170 | | - ] |
| 163 | + ("factual_correctness(mode=f1)", "factual_correctness"), |
| 164 | + ("nv_accuracy", "answer_accuracy"), |
| 165 | + ("semantic_similarity", "semantic_similarity"), |
| 166 | + ] |
| 167 | + |
171 | 168 |
172 | 169 | df = results.to_pandas() |
173 | 170 | available_columns = list(df.columns) |
174 | 171 | print(f"Results DataFrame columns: {available_columns}") |
| 172 | + |
175 | 173 |
176 | 174 | # Verify required columns |
177 | 175 | missing = [raw for raw, _ in expected_metrics if raw not in available_columns] |
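Put together, the new evaluation path looks roughly like the sketch below. This is a reconstruction from the diff, not the full script: the embeddings wiring for SemanticSimilarity is an assumption (the hunk only shows `evaluate(dataset=dataset, metrics=metrics, llm=llm)`), and the renaming step simply applies the expected_metrics mapping above.

```python
from ragas import evaluate
from ragas.metrics import SemanticSimilarity, FactualCorrectness, AnswerAccuracy

# llm / embeddings are the wrapped LangChain models, dataset is the
# EvaluationDataset built from the JSONL samples (see sketches above).
metrics = [
    SemanticSimilarity(embeddings=embeddings),  # assumption: embeddings passed here
    FactualCorrectness(llm=llm),
    AnswerAccuracy(llm=llm),
]

results = evaluate(dataset=dataset, metrics=metrics, llm=llm)
df = results.to_pandas()

# Map raw score columns to the output names used downstream.
expected_metrics = [
    ("factual_correctness(mode=f1)", "factual_correctness"),
    ("nv_accuracy", "answer_accuracy"),
    ("semantic_similarity", "semantic_similarity"),
]
df = df.rename(columns={raw: out for raw, out in expected_metrics})
```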