@@ -10,9 +10,8 @@
 import pandas as pd
 from ragas import evaluate, EvaluationDataset, SingleTurnSample
 from ragas.llms import LangchainLLMWrapper
-from langchain_openai import ChatOpenAI
-from ragas.metrics import FactualCorrectness, SemanticSimilarity
-from ragas.metrics._nv_metrics import AnswerAccuracy
+from langchain_openai.chat_models import ChatOpenAI
+from ragas.metrics import answer_relevancy, ContextRelevance, SemanticSimilarity, context_precision
 from ragas.embeddings import LangchainEmbeddingsWrapper
 from langchain_openai import OpenAIEmbeddings
 from dotenv import load_dotenv
@@ -151,11 +150,15 @@ async def evaluate_with_ragas(
     dataset, samples, processed_data = create_ragas_dataset(data)
 
     # Define metrics to use for evaluation
-    print("Configuring default RAGAS metrics: factual_correctness, semantic_similarity, answer_accuracy")
+    print(
+        "Configuring default RAGAS metrics: semantic_similarity, "
+        "answer_relevancy, context_relevance, context_precision"
+    )
     metrics = [
-        FactualCorrectness(llm=llm),
-        SemanticSimilarity(embeddings=embeddings_wrapper),
-        AnswerAccuracy(llm=llm),
+        SemanticSimilarity(),
+        answer_relevancy,
+        context_precision,
+        ContextRelevance(llm=llm),
     ]
 
     # Run the evaluation
@@ -166,9 +169,10 @@ async def evaluate_with_ragas(
166169 print ("Processing evaluation results including llm_usage if present..." )
167170 # Define expected metrics for alignment and output naming
168171 expected_metrics = [
169- ("factual_correctness(mode=f1)" , "factual_correctness" ),
172+ ("nv_context_relevance" , "recontext_relevance" ),
173+ ("context_precision" , "context_precision" ),
174+ ("answer_relevancy" , "answer_relevancy" ),
170175 ("semantic_similarity" , "semantic_similarity" ),
171- ("nv_accuracy" , "answer_accuracy" ),
172176 ]
173177
174178 df = results .to_pandas ()
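
For context, the following is a minimal, self-contained sketch of how the updated metric set could be exercised end to end. It is not part of this commit: the model name, the sample fields, and the single-example dataset are placeholders, and the repository's surrounding code (create_ragas_dataset, the llm_usage handling, and so on) is omitted.

from ragas import evaluate, EvaluationDataset, SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    ContextRelevance,
    SemanticSimilarity,
)

# Wrap LangChain models for RAGAS; "gpt-4o-mini" is a placeholder model name.
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# One illustrative sample; real data would come from create_ragas_dataset().
sample = SingleTurnSample(
    user_input="What does RAGAS evaluate?",
    response="RAGAS scores RAG pipelines on retrieval and answer quality.",
    reference="RAGAS evaluates retrieval-augmented generation pipelines.",
    retrieved_contexts=["RAGAS is a framework for evaluating RAG pipelines."],
)
dataset = EvaluationDataset(samples=[sample])

metrics = [
    SemanticSimilarity(),       # embeddings are injected by evaluate()
    answer_relevancy,           # prebuilt instance exported by ragas.metrics
    context_precision,          # prebuilt instance exported by ragas.metrics
    ContextRelevance(llm=llm),  # reported under the "nv_context_relevance" column
]

results = evaluate(dataset=dataset, metrics=metrics, llm=llm, embeddings=embeddings)
print(results.to_pandas())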