From db2c434a38b4f9f89355b16c91d8c86c8671f1bf Mon Sep 17 00:00:00 2001 From: MilesHolland <108901744+MilesHolland@users.noreply.github.com> Date: Fri, 27 Dec 2024 13:22:36 -0500 Subject: [PATCH] qa uses base classes and is testable (#38993) * qa uses base classes and is testable * evaluator list input for base class --- .../azure-ai-evaluation/assets.json | 2 +- .../_evaluators/_common/__init__.py | 2 + .../_evaluators/_common/_base_multi_eval.py | 61 +++++++++++++++++++ .../_content_safety/_content_safety.py | 47 ++------------ .../ai/evaluation/_evaluators/_qa/_qa.py | 59 ++++++++++-------- .../tests/e2etests/test_mass_evaluate.py | 60 +++++++++--------- 6 files changed, 131 insertions(+), 100 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 7db6011b3d29..ffb51abc2417 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_326efc986d" + "Tag": "python/evaluation/azure-ai-evaluation_23e89ff5ac" } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/__init__.py index f113ac93325d..e883883e21d4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/__init__.py @@ -5,9 +5,11 @@ from ._base_eval import EvaluatorBase from ._base_prompty_eval import PromptyEvaluatorBase from ._base_rai_svc_eval import RaiServiceEvaluatorBase +from ._base_multi_eval import MultiEvaluatorBase __all__ = [ "EvaluatorBase", "PromptyEvaluatorBase", "RaiServiceEvaluatorBase", + "MultiEvaluatorBase", ] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py new file mode 100644 index 000000000000..785b9b86f7f3 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py @@ -0,0 +1,61 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +from concurrent.futures import as_completed +from typing import TypeVar, Dict, List + +from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor +from typing_extensions import override + +from azure.ai.evaluation._evaluators._common import EvaluatorBase + +T = TypeVar("T") + + +class MultiEvaluatorBase(EvaluatorBase[T]): + """ + Base class for evaluators that contain and run multiple other evaluators to produce a + suite of metrics. + + Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval. + + :param evaluators: The list of evaluators to run when this evaluator is called. + :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase] + :param kwargs: Additional arguments to pass to the evaluator. 
+ :type kwargs: Any + :return: An evaluator that runs multiple other evaluators and combines their results. + """ + + def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs): + super().__init__() + self._parallel = kwargs.pop("_parallel", True) + self._evaluators = evaluators + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: + """Run each evaluator, possibly in parallel, and combine the results into + a single large dictionary containing each evaluation. Inputs are passed + directly to each evaluator without additional processing. + + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + results: Dict[str, T] = {} + if self._parallel: + with ThreadPoolExecutor() as executor: + # pylint: disable=no-value-for-parameter + futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators} + + for future in as_completed(futures): + results.update(future.result()) + else: + for evaluator in self._evaluators: + result = evaluator(**eval_input) + # Ignore is to avoid mypy getting upset over the amount of duck-typing + # that's going on to shove evaluators around like this. + results.update(result) # type: ignore[arg-type] + + return results diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 23c1e174e1c3..85814f57915e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -1,13 +1,11 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from concurrent.futures import as_completed -from typing import Callable, Dict, List, Union +from typing import Dict, List, Union -from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor from typing_extensions import overload, override -from azure.ai.evaluation._evaluators._common import EvaluatorBase +from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase from azure.ai.evaluation._model_configurations import Conversation from azure.ai.evaluation._common._experimental import experimental @@ -18,7 +16,7 @@ @experimental -class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): +class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]): """ Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario. @@ -44,16 +42,14 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): id = "content_safety" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" - # TODO address 3579092 to re-enabled parallel evals. 
def __init__(self, credential, azure_ai_project, **kwargs): - super().__init__() - self._parallel = kwargs.pop("_parallel", True) - self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ + evaluators = [ ViolenceEvaluator(credential, azure_ai_project), SexualEvaluator(credential, azure_ai_project), SelfHarmEvaluator(credential, azure_ai_project), HateUnfairnessEvaluator(credential, azure_ai_project), ] + super().__init__(evaluators=evaluators, **kwargs) @overload def __call__( @@ -109,36 +105,3 @@ def __call__( # pylint: disable=docstring-missing-param :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ return super().__call__(*args, **kwargs) - - @override - async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: - """Perform the evaluation using the Azure AI RAI service. - The exact evaluation performed is determined by the evaluation metric supplied - by the child class initializer. - - :param eval_input: The input to the evaluation function. - :type eval_input: Dict - :return: The evaluation result. - :rtype: Dict - """ - query = eval_input.get("query", None) - response = eval_input.get("response", None) - conversation = eval_input.get("conversation", None) - results: Dict[str, Union[str, float]] = {} - # TODO fix this to not explode on empty optional inputs (PF SKD error) - if self._parallel: - with ThreadPoolExecutor() as executor: - # pylint: disable=no-value-for-parameter - futures = { - executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator - for evaluator in self._evaluators - } - - for future in as_completed(futures): - results.update(future.result()) - else: - for evaluator in self._evaluators: - result = evaluator(query=query, response=response, conversation=conversation) - results.update(result) - - return results diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index ab39cfe049ad..ede78ada5a4f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -2,10 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from concurrent.futures import as_completed -from typing import Callable, Dict, List, Union +from typing import Union -from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor +from typing_extensions import overload, override + +from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase from .._coherence import CoherenceEvaluator from .._f1_score import F1ScoreEvaluator @@ -15,7 +16,7 @@ from .._similarity import SimilarityEvaluator -class QAEvaluator: +class QAEvaluator(MultiEvaluatorBase[Union[str, float]]): """ Initialize a question-answer evaluator configured for a specific Azure OpenAI model. 
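For reference, a minimal usage sketch of the refactored QAEvaluator follows; it is not part of the diff. It assumes the package's top-level QAEvaluator export, the endpoint, deployment, and key values are placeholders, and the result keys follow the assertions in test_mass_evaluate.py further down in this patch.

# Illustrative sketch only -- placeholder Azure OpenAI settings, not part of this patch.
from azure.ai.evaluation import QAEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<chat-deployment-name>",
    "api_key": "<api-key>",
}

# MultiEvaluatorBase now owns the fan-out: a single call runs the six child evaluators
# (groundedness, relevance, coherence, fluency, similarity, F1) and merges their result
# dicts, in parallel by default since _parallel defaults to True in the base class.
qa_evaluator = QAEvaluator(model_config)
scores = qa_evaluator(
    query="Where is the Eiffel Tower?",
    response="The Eiffel Tower is in Paris.",
    context="The Eiffel Tower is a landmark in Paris, France.",
    ground_truth="Paris, France",
)
# Expected keys include "f1_score", "groundedness", "gpt_groundedness",
# "groundedness_reason", and the analogous coherence/fluency/relevance/similarity entries.
print(scores["f1_score"], scores["groundedness"])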
@@ -46,9 +47,7 @@ class QAEvaluator: """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" def __init__(self, model_config, **kwargs): - self._parallel = kwargs.pop("_parallel", False) - - self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [ + evaluators = [ GroundednessEvaluator(model_config), RelevanceEvaluator(model_config), CoherenceEvaluator(model_config), @@ -56,8 +55,31 @@ def __init__(self, model_config, **kwargs): SimilarityEvaluator(model_config), F1ScoreEvaluator(), ] + super().__init__(evaluators=evaluators, **kwargs) + + @overload # type: ignore + def __call__(self, *, query: str, response: str, context: str, ground_truth: str): + """ + Evaluates question-answering scenario. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword context: The context to be evaluated. + :paramtype context: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str + :return: The scores for QA scenario. + :rtype: Dict[str, Union[str, float]] + """ - def __call__(self, *, query: str, response: str, context: str, ground_truth: str, **kwargs): + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): """ Evaluates question-answering scenario. @@ -72,22 +94,5 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :return: The scores for QA scenario. :rtype: Dict[str, Union[str, float]] """ - results: Dict[str, Union[str, float]] = {} - if self._parallel: - with ThreadPoolExecutor() as executor: - futures = { - executor.submit( - evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs - ): evaluator - for evaluator in self._evaluators - } - - # Collect results as they complete - for future in as_completed(futures): - results.update(future.result()) - else: - for evaluator in self._evaluators: - result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs) - results.update(result) - - return results + + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py index 93c592c92db9..08048fead716 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py @@ -19,7 +19,7 @@ RelevanceEvaluator, SimilarityEvaluator, GroundednessEvaluator, - # QAEvaluator, + QAEvaluator, ContentSafetyEvaluator, GroundednessProEvaluator, ProtectedMaterialEvaluator, @@ -88,7 +88,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope "fluency": FluencyEvaluator(model_config), "relevance": RelevanceEvaluator(model_config), "similarity": SimilarityEvaluator(model_config), - # "qa": QAEvaluator(model_config), + "qa": QAEvaluator(model_config), "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope), "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope), "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope), @@ -105,7 +105,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope row_result_df = pd.DataFrame(result["rows"]) metrics = result["metrics"] - assert len(row_result_df.keys()) == 48 # 63 with qa 
+ assert len(row_result_df.keys()) == 63 assert len(row_result_df["inputs.query"]) == 3 assert len(row_result_df["inputs.context"]) == 3 assert len(row_result_df["inputs.response"]) == 3 @@ -154,23 +154,23 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope assert len(row_result_df["outputs.content_safety.violence"]) == 3 assert len(row_result_df["outputs.content_safety.violence_score"]) == 3 assert len(row_result_df["outputs.content_safety.violence_reason"]) == 3 - # assert len(row_result_df["outputs.qa.f1_score"]) == 3 - # assert len(row_result_df["outputs.qa.groundedness"]) == 3 - # assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3 - # assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3 - # assert len(row_result_df["outputs.qa.coherence"]) == 3 - # assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3 - # assert len(row_result_df["outputs.qa.coherence_reason"]) == 3 - # assert len(row_result_df["outputs.qa.fluency"]) == 3 - # assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3 - # assert len(row_result_df["outputs.qa.fluency_reason"]) == 3 - # assert len(row_result_df["outputs.qa.relevance"]) == 3 - # assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3 - # assert len(row_result_df["outputs.qa.relevance_reason"]) == 3 - # assert len(row_result_df["outputs.qa.similarity"]) == 3 - # assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3 + assert len(row_result_df["outputs.qa.f1_score"]) == 3 + assert len(row_result_df["outputs.qa.groundedness"]) == 3 + assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3 + assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3 + assert len(row_result_df["outputs.qa.coherence"]) == 3 + assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3 + assert len(row_result_df["outputs.qa.coherence_reason"]) == 3 + assert len(row_result_df["outputs.qa.fluency"]) == 3 + assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3 + assert len(row_result_df["outputs.qa.fluency_reason"]) == 3 + assert len(row_result_df["outputs.qa.relevance"]) == 3 + assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3 + assert len(row_result_df["outputs.qa.relevance_reason"]) == 3 + assert len(row_result_df["outputs.qa.similarity"]) == 3 + assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3 - assert len(metrics.keys()) == 28 # 39 with qa + assert len(metrics.keys()) == 39 assert metrics["f1_score.f1_score"] >= 0 assert metrics["gleu.gleu_score"] >= 0 assert metrics["bleu.bleu_score"] >= 0 @@ -199,17 +199,17 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope assert metrics["protected_material.protected_material_defect_rate"] >= 0 assert metrics["indirect_attack.xpia_defect_rate"] >= 0 assert metrics["eci.eci_defect_rate"] >= 0 - # assert metrics["qa.f1_score"] >= 0 - # assert metrics["qa.groundedness"] >= 0 - # assert metrics["qa.gpt_groundedness"] >= 0 - # assert metrics["qa.coherence"] >= 0 - # assert metrics["qa.gpt_coherence"] >= 0 - # assert metrics["qa.fluency"] >= 0 - # assert metrics["qa.gpt_fluency"] >= 0 - # assert metrics["qa.relevance"] >= 0 - # assert metrics["qa.gpt_relevance"] >= 0 - # assert metrics["qa.similarity"] >= 0 - # assert metrics["qa.gpt_similarity"] >= 0 + assert metrics["qa.f1_score"] >= 0 + assert metrics["qa.groundedness"] >= 0 + assert metrics["qa.gpt_groundedness"] >= 0 + assert metrics["qa.coherence"] >= 0 + assert metrics["qa.gpt_coherence"] >= 0 + assert metrics["qa.fluency"] >= 0 + assert 
metrics["qa.gpt_fluency"] >= 0 + assert metrics["qa.relevance"] >= 0 + assert metrics["qa.gpt_relevance"] >= 0 + assert metrics["qa.similarity"] >= 0 + assert metrics["qa.gpt_similarity"] >= 0 def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope): evaluators = {