From 8e29433b6732eb0a2748df334f6d149158e5260d Mon Sep 17 00:00:00 2001
From: Shahul ES
Date: Thu, 6 Jul 2023 12:47:24 +0530
Subject: [PATCH] Rename metrics (#48)

Rename factuality to faithfulness to convey the idea correctly and in favor
of an incoming feature that measures factual consistency.
---
 README.md                                    |  8 +++---
 docs/assets/bar-graph.svg                    |  2 +-
 docs/metrics.md                              |  6 ++---
 examples/quickstart.ipynb                    | 16 ++++++------
 .../assesments/metrics_assesments.ipynb      | 26 +++++++++----------
 .../dataset-exploration-and-baseline.ipynb   |  4 +--
 references.md                                |  2 +-
 src/ragas/evaluation.py                      |  6 ++---
 src/ragas/metrics/__init__.py                |  6 ++---
 src/ragas/metrics/factual.py                 |  6 ++---
 tests/benchmarks/benchmark_eval.py           |  4 +--
 11 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index f967bdf84..e416fa7f7 100644
--- a/README.md
+++ b/README.md
@@ -74,14 +74,14 @@ dataset: Dataset
 
 results = evaluate(dataset)
 # {'ragas_score': 0.860, 'context_relavency': 0.817,
-# 'factuality': 0.892, 'answer_relevancy': 0.874}
+# 'faithfulness': 0.892, 'answer_relevancy': 0.874}
 ```
 If you want a more in-depth explanation of core components, check out our [quick-start notebook](./examples/quickstart.ipynb)
 
 ## :luggage: Metrics
 
 Ragas measures your pipeline's performance against two dimensions
 
-1. **Factuality**: measures the factual consistency of the generated answer against the given context.
-2. **Relevancy**: measures how relevant retrieved contexts and the generated answer are to the question.
+1. **Faithfulness**: measures the information consistency of the generated answer against the given context. Any claims made in the answer that cannot be deduced from the context are penalized.
+2. **Relevancy**: measures how relevant retrieved contexts and the generated answer are to the question. The presence of extra or redundant information is penalized.
 
 Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors.
@@ -103,7 +103,7 @@ If you want to get more involved with Ragas, check out our [discord server](http
 ## :raising_hand_man: FAQ
 
 1. Why harmonic mean?
 
-Harmonic mean penalizes extreme values. For example, if your generated answer is fully factually consistent with the context (factuality = 1) but is not relevant to the question (relevancy = 0), a simple average would give you a score of 0.5 but a harmonic mean will give you 0.0
+Harmonic mean penalizes extreme values. For example, if your generated answer is fully factually consistent with the context (faithfulness = 1) but is not relevant to the question (relevancy = 0), a simple average would give you a score of 0.5 but a harmonic mean will give you 0.0.

diff --git a/docs/assets/bar-graph.svg b/docs/assets/bar-graph.svg
index 0b94b295e..dd24e26c5 100644
--- a/docs/assets/bar-graph.svg
+++ b/docs/assets/bar-graph.svg
@@ -1 +1 @@
-Wiki QA datasetfactualityrelevanceGPT3.5ragas020406080100method usedCorrelation against human judgement
\ No newline at end of file
+Wiki QA datasetfaithfulnessrelevanceGPT3.5ragas020406080100method usedCorrelation against human judgement
\ No newline at end of file

diff --git a/docs/metrics.md b/docs/metrics.md
index 25ba7b376..3a81498a7 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -1,15 +1,15 @@
 # Metrics
-1. `factuality` : measures the factual consistency of the generated answer against the given context. This is done using a multi step paradigm that includes creation of statements from the generated answer followed by verifying each of these statements against the context. The answer is scaled to (0,1) range. Higher the better.
+1. `faithfulness` : measures the factual consistency of the generated answer against the given context. This is done using a multi-step paradigm that creates statements from the generated answer and then verifies each of these statements against the context. The score is scaled to the (0,1) range; higher is better.
 ```python
-from ragas.metrics import factuality
+from ragas.metrics import faithfulness
 
 # Dataset({
 #     features: ['question','contexts','answer'],
 #     num_rows: 25
 # })
 dataset: Dataset
 
-results = evaluate(dataset, metrics=[factuality])
+results = evaluate(dataset, metrics=[faithfulness])
 ```
 2. `answer_relevancy`: measures how relevant is the generated answer to the prompt. This is quantified using conditional likelihood of an LLM generating the question given the answer. This is implemented using a custom model. Values range (0,1), higher the better.
 ```python

diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb
index 3793695eb..f01be8446 100644
--- a/examples/quickstart.ipynb
+++ b/examples/quickstart.ipynb
@@ -122,7 +122,7 @@
     "\n",
     "Ragas measures your pipeline's performance against two dimensions\n",
     "\n",
-    "1. Factuality: measures the factual consistency of the generated answer against the given context.\n",
+    "1. Faithfulness: measures the factual consistency of the generated answer against the given context.\n",
     "2. Relevancy: measures how relevant retrieved contexts and the generated answer are to the question.\n",
     "\n",
     "Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors.\n",
@@ -137,7 +137,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ragas.metrics import context_relevancy, answer_relevancy, factuality"
+    "from ragas.metrics import context_relevancy, answer_relevancy, faithfulness"
    ]
   },
   {
@@ -149,9 +149,9 @@
     "\n",
     "1. context_relevancy - a measure of how relevent the retrieved context is to the question. Conveys quality of the retrieval pipeline.\n",
     "2. answer_relevancy - a measure of how relevent the answer is to the question\n",
-    "3. factuality - the factual consistancy of the answer to the context base on the question.\n",
+    "3. faithfulness - the factual consistency of the answer to the context based on the question.\n",
     "\n",
-    "**Note:** *`factuality` using OpenAI's API to compute the score. If you using this metric make sure you set the environment key `OPENAI_API_KEY` with your API key.*\n",
+    "**Note:** *`faithfulness` uses OpenAI's API to compute the score. If you are using this metric, make sure you set the environment variable `OPENAI_API_KEY` with your API key.*\n",
     "\n",
     "**Note:** *`context_relevancy` and `answer_relevancy` use very small LLMs to compute the score. It will run on CPU but having a GPU is recommended.*\n",
     "\n",
@@ -188,7 +188,7 @@
    {
     "data": {
      "text/plain": [
-      "{'ragas_score': 0.860, 'context_relavency': 0.817, 'factuality': 0.892, 'answer_relevancy': 0.874}"
+      "{'ragas_score': 0.860, 'context_relavency': 0.817, 'faithfulness': 0.892, 'answer_relevancy': 0.874}"
      ]
     },
     "execution_count": 8,
@@ -200,7 +200,7 @@
     "from ragas import evaluate\n",
     "\n",
     "result = evaluate(\n",
-    "    fiqa_eval[\"baseline\"], metrics=[context_relevancy, factuality, answer_relevancy]\n",
+    "    fiqa_eval[\"baseline\"], metrics=[context_relevancy, faithfulness, answer_relevancy]\n",
     ")\n",
     "\n",
     "result"
@@ -248,7 +248,7 @@
  answer
  contexts
  context_relavency
- factuality
+ faithfulness
  answer_relevancy
 
 
@@ -336,7 +336,7 @@
     "3 [Set up a meeting with the bank that handles y... 0.781 \n",
     "4 [The time horizon for your 401K/IRA is essenti... 0.737 \n",
     "\n",
-    " factuality answer_relevancy \n",
+    " faithfulness answer_relevancy \n",
     "0 1.0 0.922 \n",
     "1 1.0 0.923 \n",
     "2 1.0 0.824 \n",

diff --git a/experiments/assesments/metrics_assesments.ipynb b/experiments/assesments/metrics_assesments.ipynb
index f126ad874..ab79696d8 100644
--- a/experiments/assesments/metrics_assesments.ipynb
+++ b/experiments/assesments/metrics_assesments.ipynb
@@ -6,11 +6,11 @@
    "metadata": {},
    "source": [
     "## Logs\n",
-    "- Factuality NLI\n",
+    "- Faithfulness NLI\n",
     "  - Without CoT\n",
     "  - With CoT ( WIN) \n",
     "    - WikiQA \n",
-    "      - generated non factual answer for measuring factuality agreement.\n",
+    "      - generated non factual answer for measuring faithfulness agreement.\n",
     "      - Kendall Score = 0.7\n",
     "    - HotPotQA\n",
     "      - Accuracy = 0.75 \n",
@@ -334,12 +334,12 @@
     "reply:{}\n",
     "score:\"\"\"\n",
     "\n",
-    "factuality = \"\"\"\n",
+    "faithfulness = \"\"\"\n",
     "Evaluation Criteria.\\n\n",
-    "Factuality (1-5) - how factually consistant is the reply with the given context.\n",
+    "Faithfulness (1-5) - how factually consistent is the reply with the given context.\n",
     "1. Read the reply and compare it to the question. Check if the given reply\n",
     "actually answers the question correctly, and if the reply is factualy consistent with the context.\n",
-    "2. Assign a score for factuality on a scale of 1 to 5, where 1 is the lowest and\n",
+    "2. Assign a score for faithfulness on a scale of 1 to 5, where 1 is the lowest and\n",
     "5 is the highest based on the Evaluation Criteria.\n",
     "\n",
     "context: {}\n",
@@ -355,8 +355,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def gpt_factuality(question:list, context:list, answer:list):\n",
-    "    prompt = [factuality.format(c,q, a) for c,q,a in zip(question,context,answer)]\n",
+    "def gpt_faithfulness(question:list, context:list, answer:list):\n",
+    "    prompt = [faithfulness.format(c,q, a) for c,q,a in zip(question,context,answer)]\n",
     "    output = [output for output in llm(prompt)['choices']]\n",
     "    scores = [(out[\"text\"].strip()) for out in output ]\n",
     "    scores = [int(score) if score in ['1','2','3','4','5'] else 1 for score in scores]\n",
@@ -446,7 +446,7 @@
    }
   ],
   "source": [
-   "gpt_factuality([q],[c], [a])"
+   "gpt_faithfulness([q],[c], [a])"
   ]
  },
  {
@@ -522,7 +522,7 @@
    "    passage = examples[col]\n",
    "    inputs = list(zip(questions, passage))\n",
    "    #scores[f\"{col}_relevance\"] = t5_qgen.predict(inputs, show_progress=False)\n",
-   "    scores[f\"{col}_relevance\"] = gpt_factuality(questions,context,passage)\n",
+   "    scores[f\"{col}_relevance\"] = gpt_faithfulness(questions,context,passage)\n",
    "    return scores"
   ]
  },
@@ -594,7 +594,7 @@
   "id": "89d8ccbc",
   "metadata": {},
   "source": [
-   "## Factuality"
+   "## Faithfulness"
   ]
  },
  {
@@ -725,7 +725,7 @@
   "id": "cefd9923",
   "metadata": {},
   "source": [
-   "## Factuality on HotpotQA\n"
+   "## Faithfulness on HotpotQA\n"
   ]
  },
  {
@@ -877,7 +877,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "def predict_factuality(examples,scoring_fun=NLI.score):\n",
+   "def predict_faithfulness(examples,scoring_fun=NLI.score):\n",
    "    scores = {}\n",
    "    questions = examples[\"question\"]\n",
    "    contexts = examples[\"answer_context\"]\n",
@@ -909,7 +909,7 @@
   ],
   "source": [
    "COLUMNS = [\"answer\", \"false_answer\"]\n",
-   "hotpot_qa = hotpot_qa.map(predict_factuality, batched=True, batch_size=8)"
+   "hotpot_qa = hotpot_qa.map(predict_faithfulness, batched=True, batch_size=8)"
   ]
  },
  {

diff --git a/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb b/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb
index 97ca5e67a..69c595ea7 100644
--- a/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb
+++ b/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb
@@ -1513,10 +1513,10 @@
    }
   ],
   "source": [
-   "from ragas.metrics import factuality, answer_relevancy, context_relevancy\n",
+   "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy\n",
    "from ragas import evaluate\n",
    "\n",
-   "evaluate(gen_ds, metrics=[factuality, answer_relevancy, context_relevancy])"
+   "evaluate(gen_ds, metrics=[faithfulness, answer_relevancy, context_relevancy])"
   ]
  },
  {

diff --git a/references.md b/references.md
index 5af7c6e27..9ef07ae12 100644
--- a/references.md
+++ b/references.md
@@ -18,7 +18,7 @@
 DOI=https://doi.org/10.48550/arXiv.1904.09675
 }
 
 @misc{
-title={On Faithfulness and Factuality in Abstractive Summarization},
+title={On Faithfulness and Factuality in Abstractive Summarization},
 author={Maynez* et al.},
 year={2020},
 DOI=https://doi.org/10.48550/arXiv.2005.00661 }

diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 51f7c30fe..dbb39b9c3 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -62,7 +62,7 @@ def evaluate(
 
     >>> result = evaluate(dataset)
     >>> print(result["ragas_score"])
-    {'ragas_score': 0.860, 'context_relavency': 0.817, 'factuality': 0.892,
+    {'ragas_score': 0.860, 'context_relavency': 0.817, 'faithfulness': 0.892,
     'answer_relevancy': 0.874}
     ```
     """
@@ -75,9 +75,9 @@ def evaluate(
     # TODO: check if all the metrics are compatible with the evaluation mode
 
     if metrics is None:
-        from ragas.metrics import answer_relevancy, context_relevancy, factuality
+        from ragas.metrics import answer_relevancy, context_relevancy, faithfulness
 
-        metrics = [answer_relevancy, context_relevancy, factuality]
+        metrics = [answer_relevancy, context_relevancy, faithfulness]
 
     # run the evaluation on dataset with different metrics
     # initialize all the models in the metrics

diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 251afcfef..ce96f04ae 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -1,10 +1,10 @@
 from ragas.metrics.answer_relevance import AnswerRelevancy, answer_relevancy
 from ragas.metrics.context_relevance import ContextRelevancy, context_relevancy
-from ragas.metrics.factual import Factuality, factuality
+from ragas.metrics.factual import Faithfulness, faithfulness
 
 __all__ = [
-    "Factuality",
-    "factuality",
+    "Faithfulness",
+    "faithfulness",
     "AnswerRelevancy",
     "answer_relevancy",
     "ContextRelevancy",

diff --git a/src/ragas/metrics/factual.py b/src/ragas/metrics/factual.py
index b1ede6935..151159541 100644
--- a/src/ragas/metrics/factual.py
+++ b/src/ragas/metrics/factual.py
@@ -57,12 +57,12 @@
 
 
 @dataclass
-class Factuality(Metric):
+class Faithfulness(Metric):
     batch_size: int = 15
 
     @property
     def name(self):
-        return "factuality"
+        return "faithfulness"
 
     def init_model(self: t.Self):
         pass
@@ -124,4 +124,4 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset:
 
         return ds.add_column(f"{self.name}", scores)  # type: ignore
 
-factuality = Factuality()
+faithfulness = Faithfulness()

diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py
index d595a65e1..692db6973 100644
--- a/tests/benchmarks/benchmark_eval.py
+++ b/tests/benchmarks/benchmark_eval.py
@@ -4,7 +4,7 @@
 from torch.cuda import is_available
 
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_relavancy, factuality
+from ragas.metrics import answer_relevancy, context_relevancy, faithfulness
 
 DEVICE = "cuda" if is_available() else "cpu"
 
@@ -16,6 +16,6 @@
 if __name__ == "__main__":
     result = evaluate(
         ds,
-        metrics=[answer_relevancy, context_relevancy, faithfulness],
+        metrics=[answer_relevancy, context_relevancy, faithfulness],
     )
     print(result)
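
Note for downstream users: this rename only changes the metric's import name and the key it reports; the scoring behaviour is untouched. Below is a minimal migration sketch, not taken from the patch itself, assuming a Hugging Face `Dataset` with the `question`, `contexts`, and `answer` columns that the README example uses (the row contents here are purely illustrative).

```python
from datasets import Dataset

from ragas import evaluate
# Before this patch: `from ragas.metrics import factuality`
from ragas.metrics import answer_relevancy, context_relevancy, faithfulness

# Tiny illustrative dataset with the columns ragas expects; a real evaluation
# would use many more rows, and faithfulness needs OPENAI_API_KEY to be set.
dataset = Dataset.from_dict(
    {
        "question": ["How do I open a retirement account?"],
        "contexts": [["Set up a meeting with the bank that handles your retirement plan."]],
        "answer": ["Schedule a meeting with the bank that manages your retirement plan."],
    }
)

results = evaluate(dataset, metrics=[context_relevancy, faithfulness, answer_relevancy])

# The reported key follows the metric's name, so code that previously read
# results['factuality'] should now read results['faithfulness'].
print(results["faithfulness"])
```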
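The FAQ text touched by this patch argues for combining the two renamed dimensions with a harmonic rather than an arithmetic mean. A small sketch of that arithmetic (the variable names are only for illustration, not part of the library):

```python
# FAQ example: a fully faithful but completely irrelevant answer.
faithfulness_score = 1.0
relevancy_score = 0.0

arithmetic_mean = (faithfulness_score + relevancy_score) / 2
# Harmonic mean of two values, with a guard for a zero denominator.
total = faithfulness_score + relevancy_score
harmonic_mean = (2 * faithfulness_score * relevancy_score / total) if total else 0.0

print(arithmetic_mean)  # 0.5 -- hides the irrelevant answer
print(harmonic_mean)    # 0.0 -- the extreme value dominates, as the FAQ describes
```

Because the harmonic mean collapses to zero as soon as either dimension is zero, a high `ragas_score` requires the pipeline to do well on both faithfulness and relevancy at once.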