qa uses base classes and is testable (Azure#38993)
* qa uses base classes and is testable

* evaluator list input for base class
MilesHolland authored Dec 27, 2024
1 parent c694e3f commit db2c434
Showing 6 changed files with 131 additions and 100 deletions.
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_326efc986d"
"Tag": "python/evaluation/azure-ai-evaluation_23e89ff5ac"
}
2 changes: 2 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/__init__.py
@@ -5,9 +5,11 @@
from ._base_eval import EvaluatorBase
from ._base_prompty_eval import PromptyEvaluatorBase
from ._base_rai_svc_eval import RaiServiceEvaluatorBase
from ._base_multi_eval import MultiEvaluatorBase

__all__ = [
"EvaluatorBase",
"PromptyEvaluatorBase",
"RaiServiceEvaluatorBase",
"MultiEvaluatorBase",
]
61 changes: 61 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py
@@ -0,0 +1,61 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from concurrent.futures import as_completed
from typing import TypeVar, Dict, List

from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from typing_extensions import override

from azure.ai.evaluation._evaluators._common import EvaluatorBase

T = TypeVar("T")


class MultiEvaluatorBase(EvaluatorBase[T]):
"""
Base class for evaluators that contain and run multiple other evaluators to produce a
suite of metrics.
Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
:param evaluators: The list of evaluators to run when this evaluator is called.
:type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any
:return: An evaluator that runs multiple other evaluators and combines their results.
"""

def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
super().__init__()
self._parallel = kwargs.pop("_parallel", True)
self._evaluators = evaluators

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
"""Run each evaluator, possibly in parallel, and combine the results into
a single large dictionary containing each evaluation. Inputs are passed
directly to each evaluator without additional processing.
:param eval_input: The input to the evaluation function.
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict
"""
results: Dict[str, T] = {}
if self._parallel:
with ThreadPoolExecutor() as executor:
# pylint: disable=no-value-for-parameter
futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}

for future in as_completed(futures):
results.update(future.result())
else:
for evaluator in self._evaluators:
result = evaluator(**eval_input)
# Ignore is to avoid mypy getting upset over the amount of duck-typing
# that's going on to shove evaluators around like this.
results.update(result) # type: ignore[arg-type]

return results
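
For reference, the subclassing pattern this base class enables, and which the ContentSafetyEvaluator and QAEvaluator diffs below follow, is sketched here. The sketch is not part of the commit: the composite class name is invented, and it simply wires two existing evaluators from azure.ai.evaluation (CoherenceEvaluator and RelevanceEvaluator, both of which accept query/response inputs) into the new base class.

from typing import Union

from typing_extensions import overload, override

from azure.ai.evaluation import CoherenceEvaluator, RelevanceEvaluator
from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase


class MyQualityEvaluator(MultiEvaluatorBase[Union[str, float]]):
    """Hypothetical composite evaluator that fans out to two existing evaluators."""

    def __init__(self, model_config, **kwargs):
        evaluators = [
            CoherenceEvaluator(model_config),
            RelevanceEvaluator(model_config),
        ]
        # The base class stores the list, runs the (optionally parallel) fan-out,
        # and merges the per-evaluator result dicts in its _do_eval.
        super().__init__(evaluators=evaluators, **kwargs)

    @overload  # type: ignore
    def __call__(self, *, query: str, response: str):
        """Evaluate coherence and relevance for a query/response pair."""

    @override
    def __call__(self, *args, **kwargs):
        # No custom _do_eval is needed; the shared base-class machinery does the work.
        return super().__call__(*args, **kwargs)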
@@ -1,13 +1,11 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from concurrent.futures import as_completed
from typing import Callable, Dict, List, Union
from typing import Dict, List, Union

from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import EvaluatorBase
from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._common._experimental import experimental

@@ -18,7 +16,7 @@


@experimental
class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
"""
Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
@@ -44,16 +42,14 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
id = "content_safety"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

# TODO address 3579092 to re-enabled parallel evals.
def __init__(self, credential, azure_ai_project, **kwargs):
super().__init__()
self._parallel = kwargs.pop("_parallel", True)
self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
evaluators = [
ViolenceEvaluator(credential, azure_ai_project),
SexualEvaluator(credential, azure_ai_project),
SelfHarmEvaluator(credential, azure_ai_project),
HateUnfairnessEvaluator(credential, azure_ai_project),
]
super().__init__(evaluators=evaluators, **kwargs)

@overload
def __call__(
@@ -109,36 +105,3 @@ def __call__( # pylint: disable=docstring-missing-param
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
"""
return super().__call__(*args, **kwargs)

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
"""Perform the evaluation using the Azure AI RAI service.
The exact evaluation performed is determined by the evaluation metric supplied
by the child class initializer.
:param eval_input: The input to the evaluation function.
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict
"""
query = eval_input.get("query", None)
response = eval_input.get("response", None)
conversation = eval_input.get("conversation", None)
results: Dict[str, Union[str, float]] = {}
# TODO fix this to not explode on empty optional inputs (PF SKD error)
if self._parallel:
with ThreadPoolExecutor() as executor:
# pylint: disable=no-value-for-parameter
futures = {
executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
for evaluator in self._evaluators
}

for future in as_completed(futures):
results.update(future.result())
else:
for evaluator in self._evaluators:
result = evaluator(query=query, response=response, conversation=conversation)
results.update(result)

return results
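
The refactor is meant to be transparent to callers of ContentSafetyEvaluator. A minimal usage sketch follows; it is not taken from this commit, and the project dictionary values are placeholders with an assumed shape.

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

# Placeholder Azure AI project details; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

content_safety = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

# A single call now goes through MultiEvaluatorBase._do_eval, which fans out to the
# violence, sexual, self-harm, and hate/unfairness evaluators (in parallel by default)
# and merges their metrics into one dict.
result = content_safety(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(result)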
@@ -2,10 +2,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from concurrent.futures import as_completed
from typing import Callable, Dict, List, Union
from typing import Union

from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase

from .._coherence import CoherenceEvaluator
from .._f1_score import F1ScoreEvaluator
@@ -15,7 +16,7 @@
from .._similarity import SimilarityEvaluator


class QAEvaluator:
class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
"""
Initialize a question-answer evaluator configured for a specific Azure OpenAI model.
@@ -46,18 +47,39 @@ class QAEvaluator:
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

def __init__(self, model_config, **kwargs):
self._parallel = kwargs.pop("_parallel", False)

self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
evaluators = [
GroundednessEvaluator(model_config),
RelevanceEvaluator(model_config),
CoherenceEvaluator(model_config),
FluencyEvaluator(model_config),
SimilarityEvaluator(model_config),
F1ScoreEvaluator(),
]
super().__init__(evaluators=evaluators, **kwargs)

@overload # type: ignore
def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
"""
Evaluates question-answering scenario.
:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword context: The context to be evaluated.
:paramtype context: str
:keyword ground_truth: The ground truth to be evaluated.
:paramtype ground_truth: str
:return: The scores for QA scenario.
:rtype: Dict[str, Union[str, float]]
"""

def __call__(self, *, query: str, response: str, context: str, ground_truth: str, **kwargs):
@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""
Evaluates question-answering scenario.
@@ -72,22 +94,5 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str
:return: The scores for QA scenario.
:rtype: Dict[str, Union[str, float]]
"""
results: Dict[str, Union[str, float]] = {}
if self._parallel:
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(
evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
): evaluator
for evaluator in self._evaluators
}

# Collect results as they complete
for future in as_completed(futures):
results.update(future.result())
else:
for evaluator in self._evaluators:
result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
results.update(result)

return results

return super().__call__(*args, **kwargs)
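
With the overloads in place, QAEvaluator keeps the same public call signature while delegating execution to MultiEvaluatorBase. A minimal usage sketch follows; it is not from this commit, and the model configuration keys shown are assumed placeholders.

from azure.ai.evaluation import QAEvaluator

# Assumed Azure OpenAI model configuration; replace the placeholders with real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

qa = QAEvaluator(model_config)

# The call is routed through MultiEvaluatorBase, which runs the groundedness, relevance,
# coherence, fluency, similarity, and F1 evaluators and merges their scores into one dict.
result = qa(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)
print(result)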
@@ -19,7 +19,7 @@
RelevanceEvaluator,
SimilarityEvaluator,
GroundednessEvaluator,
# QAEvaluator,
QAEvaluator,
ContentSafetyEvaluator,
GroundednessProEvaluator,
ProtectedMaterialEvaluator,
@@ -88,7 +88,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
"fluency": FluencyEvaluator(model_config),
"relevance": RelevanceEvaluator(model_config),
"similarity": SimilarityEvaluator(model_config),
# "qa": QAEvaluator(model_config),
"qa": QAEvaluator(model_config),
"grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
"protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
"indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
@@ -105,7 +105,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

assert len(row_result_df.keys()) == 48 # 63 with qa
assert len(row_result_df.keys()) == 63
assert len(row_result_df["inputs.query"]) == 3
assert len(row_result_df["inputs.context"]) == 3
assert len(row_result_df["inputs.response"]) == 3
@@ -154,23 +154,23 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
assert len(row_result_df["outputs.content_safety.violence"]) == 3
assert len(row_result_df["outputs.content_safety.violence_score"]) == 3
assert len(row_result_df["outputs.content_safety.violence_reason"]) == 3
# assert len(row_result_df["outputs.qa.f1_score"]) == 3
# assert len(row_result_df["outputs.qa.groundedness"]) == 3
# assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3
# assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3
# assert len(row_result_df["outputs.qa.coherence"]) == 3
# assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3
# assert len(row_result_df["outputs.qa.coherence_reason"]) == 3
# assert len(row_result_df["outputs.qa.fluency"]) == 3
# assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3
# assert len(row_result_df["outputs.qa.fluency_reason"]) == 3
# assert len(row_result_df["outputs.qa.relevance"]) == 3
# assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3
# assert len(row_result_df["outputs.qa.relevance_reason"]) == 3
# assert len(row_result_df["outputs.qa.similarity"]) == 3
# assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3
assert len(row_result_df["outputs.qa.f1_score"]) == 3
assert len(row_result_df["outputs.qa.groundedness"]) == 3
assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3
assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3
assert len(row_result_df["outputs.qa.coherence"]) == 3
assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3
assert len(row_result_df["outputs.qa.coherence_reason"]) == 3
assert len(row_result_df["outputs.qa.fluency"]) == 3
assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3
assert len(row_result_df["outputs.qa.fluency_reason"]) == 3
assert len(row_result_df["outputs.qa.relevance"]) == 3
assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3
assert len(row_result_df["outputs.qa.relevance_reason"]) == 3
assert len(row_result_df["outputs.qa.similarity"]) == 3
assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3

assert len(metrics.keys()) == 28 # 39 with qa
assert len(metrics.keys()) == 39
assert metrics["f1_score.f1_score"] >= 0
assert metrics["gleu.gleu_score"] >= 0
assert metrics["bleu.bleu_score"] >= 0
@@ -199,17 +199,17 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
assert metrics["protected_material.protected_material_defect_rate"] >= 0
assert metrics["indirect_attack.xpia_defect_rate"] >= 0
assert metrics["eci.eci_defect_rate"] >= 0
# assert metrics["qa.f1_score"] >= 0
# assert metrics["qa.groundedness"] >= 0
# assert metrics["qa.gpt_groundedness"] >= 0
# assert metrics["qa.coherence"] >= 0
# assert metrics["qa.gpt_coherence"] >= 0
# assert metrics["qa.fluency"] >= 0
# assert metrics["qa.gpt_fluency"] >= 0
# assert metrics["qa.relevance"] >= 0
# assert metrics["qa.gpt_relevance"] >= 0
# assert metrics["qa.similarity"] >= 0
# assert metrics["qa.gpt_similarity"] >= 0
assert metrics["qa.f1_score"] >= 0
assert metrics["qa.groundedness"] >= 0
assert metrics["qa.gpt_groundedness"] >= 0
assert metrics["qa.coherence"] >= 0
assert metrics["qa.gpt_coherence"] >= 0
assert metrics["qa.fluency"] >= 0
assert metrics["qa.gpt_fluency"] >= 0
assert metrics["qa.relevance"] >= 0
assert metrics["qa.gpt_relevance"] >= 0
assert metrics["qa.similarity"] >= 0
assert metrics["qa.gpt_similarity"] >= 0

def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
evaluators = {