From 8bff4275f1d09f4f6e28fa3288dc405c371e8f01 Mon Sep 17 00:00:00 2001 From: MilesHolland <108901744+MilesHolland@users.noreply.github.com> Date: Mon, 23 Dec 2024 10:46:15 -0500 Subject: [PATCH] refactor math evals (#38951) * refactor math evals * fix tests, add fail flag to evaluate --- .../ai/evaluation/_evaluate/_evaluate.py | 42 ++++++ .../ai/evaluation/_evaluators/_bleu/_bleu.py | 71 ++++++---- .../_evaluators/_f1_score/_f1_score.py | 129 +++++++++--------- .../ai/evaluation/_evaluators/_gleu/_gleu.py | 68 +++++---- .../evaluation/_evaluators/_meteor/_meteor.py | 89 +++++++----- .../evaluation/_evaluators/_rouge/_rouge.py | 67 +++++---- .../tests/unittests/test_evaluate.py | 35 +++-- .../tests/unittests/test_save_eval.py | 10 +- 8 files changed, 315 insertions(+), 196 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 87ce23f0669a..a325e8c68c74 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -20,6 +20,7 @@ from .._constants import ( CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, + DefaultOpenEncoding, Prefixes, _InternalEvaluationMetrics, ) @@ -569,6 +570,7 @@ def evaluate( evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None, azure_ai_project: Optional[AzureAIProject] = None, output_path: Optional[Union[str, os.PathLike]] = None, + fail_on_evaluator_errors: bool = False, **kwargs, ) -> EvaluationResult: """Evaluates target or data with built-in or custom evaluators. If both target and data are provided, @@ -594,6 +596,11 @@ def evaluate( :paramtype output_path: Optional[str] :keyword azure_ai_project: Logs evaluation results to AI Studio if set. :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject] + :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException + if ANY evaluator fails during their evaluation. + Defaults to false, which means that evaluations will continue regardless of failures. + If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs. + :paramtype fail_on_evaluator_errors: bool :return: Evaluation results. :rtype: ~azure.ai.evaluation.EvaluationResult @@ -615,6 +622,7 @@ def evaluate( evaluator_config=evaluator_config, azure_ai_project=azure_ai_project, output_path=output_path, + fail_on_evaluator_errors=fail_on_evaluator_errors, **kwargs, ) except Exception as e: @@ -663,6 +671,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None: print("\n====================================================\n") +def _print_fail_flag_warning() -> None: + print( + "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable " + + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, " + + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing " + + "without producing any outputs, since a single failure will cancel the entire run " + "when fail_on_evaluator_errors is enabled." 
+ ) + + def _evaluate( # pylint: disable=too-many-locals,too-many-statements *, evaluators: Dict[str, Callable], @@ -672,8 +690,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None, azure_ai_project: Optional[AzureAIProject] = None, output_path: Optional[Union[str, os.PathLike]] = None, + fail_on_evaluator_errors: bool = False, **kwargs, ) -> EvaluationResult: + if fail_on_evaluator_errors: + _print_fail_flag_warning() input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name) # Process evaluator config to replace ${target.} with ${data.} @@ -773,6 +794,10 @@ def eval_batch_run( evaluators_result_df = None evaluators_metric = {} for evaluator_name, evaluator_result in per_evaluator_results.items(): + if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0: + _print_summary(per_evaluator_results) + _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json") + evaluator_result_df = evaluator_result["result"] # drop input columns @@ -825,3 +850,20 @@ def eval_batch_run( _write_output(output_path, result) return result + + +def _turn_error_logs_into_exception(log_path: str) -> None: + """Produce an EvaluationException using the contents of the inputted + file as the error message. + + :param log_path: The path to the error log file. + :type log_path: str + """ + with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file: + error_message = file.read() + raise EvaluationException( + message=error_message, + target=ErrorTarget.EVALUATE, + category=ErrorCategory.FAILED_EXECUTION, + blame=ErrorBlame.UNKNOWN, + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py index 9f1b2d78165a..f651bfc14f6b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py @@ -1,30 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Dict from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import overload, override from azure.ai.evaluation._common.utils import nltk_tokenize +from azure.ai.evaluation._evaluators._common import EvaluatorBase -class _AsyncBleuScoreEvaluator: - def __init__(self): - pass - - async def __call__(self, *, response: str, ground_truth: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(response) - - # NIST Smoothing - smoothing_function = SmoothingFunction().method4 - score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function) - - return { - "bleu_score": score, - } - -class BleuScoreEvaluator: +class BleuScoreEvaluator(EvaluatorBase): """ Calculate the BLEU score for a given response and ground truth. 
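Note: a minimal usage sketch of the new fail_on_evaluator_errors keyword added to evaluate() above. The data path and column names are placeholders; with the flag set, a single per-row evaluator failure raises an EvaluationException built from the batch run's error log instead of finishing with missing metrics.

    # Sketch only: "data.jsonl" is a placeholder dataset with "response" and
    # "ground_truth" columns for the F1 evaluator.
    from azure.ai.evaluation import F1ScoreEvaluator, evaluate

    result = evaluate(
        data="data.jsonl",
        evaluators={"f1": F1ScoreEvaluator()},
        fail_on_evaluator_errors=True,  # cancel the whole run on the first evaluator failure
    )
    print(result["metrics"])  # aggregated metric values
    print(result["rows"][0])  # per-row inputs and outputs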
@@ -51,9 +37,32 @@ class BleuScoreEvaluator: """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" def __init__(self): - self._async_evaluator = _AsyncBleuScoreEvaluator() + super().__init__() + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a glue score evaluation result. + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + reference_tokens = nltk_tokenize(ground_truth) + hypothesis_tokens = nltk_tokenize(response) - def __call__(self, *, response: str, ground_truth: str, **kwargs): + # NIST Smoothing + smoothing_function = SmoothingFunction().method4 + score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function) + + return { + "bleu_score": score, + } + + @overload # type: ignore + def __call__(self, *, response: str, ground_truth: str): """ Evaluate the BLEU score between the response and the ground truth. @@ -64,9 +73,21 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs): :return: The BLEU score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate the BLEU score between the response and the ground truth. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The BLEU score. + :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py index 3d7c3708336a..3faed440f25b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py @@ -3,45 +3,44 @@ # --------------------------------------------------------- from collections import Counter -from typing import List +from typing import List, Dict +from typing_extensions import overload, override -from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._evaluators._common import EvaluatorBase -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +class F1ScoreEvaluator(EvaluatorBase): + """ + Calculates the F1 score for a given response and ground truth or a multi-turn conversation. -class _AsyncF1ScoreEvaluator: - def __init__(self): - pass + F1 Scores range from 0 to 1, with 1 being the best possible score. - async def __call__(self, *, response: str, ground_truth: str, **kwargs): - """ - Evaluate F1 score. + The F1-score computes the ratio of the number of shared words between the model generation and + the ground truth. Ratio is computed over the individual words in the generated response against those in the ground + truth answer. 
The number of shared words between the generation and the truth is the basis of the F1 score: + precision is the ratio of the number of shared words to the total number of words in the generation, and recall + is the ratio of the number of shared words to the total number of words in the ground truth. - :keyword response: The response to be evaluated. - :paramtype response: str - :keyword ground_truth: The ground truth to be evaluated. - :paramtype ground_truth: str - :return: The F1 score. - :rtype: Dict[str, float] - """ - # Validate inputs - if not (response and response.strip() and response != "None") or not ( - ground_truth and ground_truth.strip() and ground_truth != "None" - ): - msg = "Both 'response' and 'ground_truth' must be non-empty strings." - raise EvaluationException( - message=msg, - internal_message=msg, - error_category=ErrorCategory.MISSING_FIELD, - error_blame=ErrorBlame.USER_ERROR, - error_target=ErrorTarget.F1_EVALUATOR, - ) + Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your + model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate + information in the response. - # Run f1 score computation. - f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth) - return {"f1_score": f1_result} + .. admonition:: Example: + + .. literalinclude:: ../samples/evaluation_samples_evaluate.py + :start-after: [START f1_score_evaluator] + :end-before: [END f1_score_evaluator] + :language: python + :dedent: 8 + :caption: Initialize and call an F1ScoreEvaluator. + """ + + id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3" + """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + + def __init__(self): + super().__init__() @classmethod def _compute_f1_score(cls, response: str, ground_truth: str) -> float: @@ -103,41 +102,24 @@ def lower(text): return f1 + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce an f1 score evaluation result. -class F1ScoreEvaluator: - """ - Calculates the F1 score for a given response and ground truth or a multi-turn conversation. - - F1 Scores range from 0 to 1, with 1 being the best possible score. - - The F1-score computes the ratio of the number of shared words between the model generation and - the ground truth. Ratio is computed over the individual words in the generated response against those in the ground - truth answer. The number of shared words between the generation and the truth is the basis of the F1 score: - precision is the ratio of the number of shared words to the total number of words in the generation, and recall - is the ratio of the number of shared words to the total number of words in the ground truth. - - Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your - model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate - information in the response. - - - .. admonition:: Example: - - .. literalinclude:: ../samples/evaluation_samples_evaluate.py - :start-after: [START f1_score_evaluator] - :end-before: [END f1_score_evaluator] - :language: python - :dedent: 8 - :caption: Initialize and call an F1ScoreEvaluator. 
- """ - - id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3" - """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + # Run f1 score computation. + f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth) - def __init__(self): - self._async_evaluator = _AsyncF1ScoreEvaluator() + return {"f1_score": f1_result} - def __call__(self, *, response: str, ground_truth: str, **kwargs): + @overload # type: ignore + def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]: """ Evaluate F1 score. @@ -149,9 +131,20 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs): :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs - ) + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate F1 score. - def _to_async(self): - return self._async_evaluator + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str + :return: The F1 score. + :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py index 059f1b7b952a..5416635eb552 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py @@ -1,28 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Dict from nltk.translate.gleu_score import sentence_gleu -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import overload, override from azure.ai.evaluation._common.utils import nltk_tokenize +from azure.ai.evaluation._evaluators._common import EvaluatorBase -class _AsyncGleuScoreEvaluator: - def __init__(self): - pass - - async def __call__(self, *, ground_truth: str, response: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(response) - - score = sentence_gleu([reference_tokens], hypothesis_tokens) - - return { - "gleu_score": score, - } - -class GleuScoreEvaluator: +class GleuScoreEvaluator(EvaluatorBase): """ Calculates the GLEU (Google-BLEU) score between a response and the ground truth. @@ -47,10 +35,32 @@ class GleuScoreEvaluator: id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + @override def __init__(self): - self._async_evaluator = _AsyncGleuScoreEvaluator() + super().__init__() + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a glue score evaluation result. + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. 
+ :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + reference_tokens = nltk_tokenize(ground_truth) + hypothesis_tokens = nltk_tokenize(response) - def __call__(self, *, ground_truth: str, response: str, **kwargs): + score = sentence_gleu([reference_tokens], hypothesis_tokens) + + return { + "gleu_score": score, + } + + @overload # type: ignore + def __call__(self, *, ground_truth: str, response: str): """ Evaluate the GLEU score between the response and the ground truth. @@ -61,9 +71,21 @@ def __call__(self, *, ground_truth: str, response: str, **kwargs): :return: The GLEU score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate the GLEU score between the response and the ground truth. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The GLEU score. + :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py index 82af6116b3ff..c421c210480c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py @@ -1,38 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Dict + from nltk.translate.meteor_score import meteor_score -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import overload, override from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded +from azure.ai.evaluation._evaluators._common import EvaluatorBase -class _AsyncMeteorScoreEvaluator: - def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5): - self._alpha = alpha - self._beta = beta - self._gamma = gamma - - ensure_nltk_data_downloaded() - - async def __call__(self, *, ground_truth: str, response: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(response) - - score = meteor_score( - [reference_tokens], - hypothesis_tokens, - alpha=self._alpha, - beta=self._beta, - gamma=self._gamma, - ) - - return { - "meteor_score": score, - } - - -class MeteorScoreEvaluator: +class MeteorScoreEvaluator(EvaluatorBase): """ Calculates the METEOR score for a given response and ground truth. 
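Note: the BLEU and GLEU _do_eval bodies above keep the same NLTK scoring that the removed async classes used. A standalone sketch of those underlying calls, using nltk.word_tokenize in place of the SDK-internal nltk_tokenize helper:

    from nltk import download, word_tokenize
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
    from nltk.translate.gleu_score import sentence_gleu

    download("punkt", quiet=True)  # tokenizer data, if not already present

    ground_truth = "The capital of Japan is Tokyo."
    response = "Tokyo is the capital of Japan."

    reference_tokens = word_tokenize(ground_truth)
    hypothesis_tokens = word_tokenize(response)

    # NIST smoothing (method4), matching BleuScoreEvaluator._do_eval.
    bleu = sentence_bleu(
        [reference_tokens], hypothesis_tokens, smoothing_function=SmoothingFunction().method4
    )
    # Plain sentence-level GLEU, matching GleuScoreEvaluator._do_eval.
    gleu = sentence_gleu([reference_tokens], hypothesis_tokens)
    print({"bleu_score": bleu, "gleu_score": gleu})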
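Note: a worked sketch of the shared-word F1 described in the F1ScoreEvaluator docstring above; the real _compute_f1_score additionally lowercases and strips punctuation and articles before counting.

    from collections import Counter

    response_tokens = "tokyo is the capital of japan".split()
    truth_tokens = "the capital of japan is tokyo".split()

    # Shared words, counted with multiplicity.
    num_shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())

    precision = num_shared / len(response_tokens)  # shared / words in the generation
    recall = num_shared / len(truth_tokens)        # shared / words in the ground truth
    f1 = 2 * precision * recall / (precision + recall)
    print({"f1_score": f1})  # 1.0 here: the two token multisets are identical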
@@ -68,10 +46,41 @@ class MeteorScoreEvaluator: id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + @override def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5): - self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma) + self._alpha = alpha + self._beta = beta + self._gamma = gamma + ensure_nltk_data_downloaded() + super().__init__() - def __call__(self, *, ground_truth: str, response: str, **kwargs): + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a meteor score evaluation result. + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + reference_tokens = nltk_tokenize(ground_truth) + hypothesis_tokens = nltk_tokenize(response) + score = meteor_score( + [reference_tokens], + hypothesis_tokens, + alpha=self._alpha, + beta=self._beta, + gamma=self._gamma, + ) + + return { + "meteor_score": score, + } + + @overload # type: ignore + def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]: """ Evaluate the METEOR score between the response and the ground truth. @@ -82,9 +91,21 @@ def __call__(self, *, ground_truth: str, response: str, **kwargs): :return: The METEOR score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate the METEOR score between the response and the ground truth. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The METEOR score. 
+ :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py index 458786b812e3..4298be9127fa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py @@ -3,9 +3,11 @@ # --------------------------------------------------------- from enum import Enum -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing import Dict +from typing_extensions import overload, override from azure.ai.evaluation._vendor.rouge_score import rouge_scorer +from azure.ai.evaluation._evaluators._common import EvaluatorBase class RougeType(Enum): @@ -32,21 +34,7 @@ class RougeType(Enum): """Overlap of L-grams (L consecutive words) between generated and reference text.""" -class _AsyncRougeScoreEvaluator: - def __init__(self, rouge_type: RougeType): - self._rouge_type = rouge_type - - async def __call__(self, *, ground_truth: str, response: str, **kwargs): - scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value]) - metrics = scorer.score(ground_truth, response)[self._rouge_type.value] - return { - "rouge_precision": metrics.precision, - "rouge_recall": metrics.recall, - "rouge_f1_score": metrics.fmeasure, - } - - -class RougeScoreEvaluator: +class RougeScoreEvaluator(EvaluatorBase): """ Calculates the ROUGE score for a given response and ground truth. @@ -76,10 +64,32 @@ class RougeScoreEvaluator: id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + @override def __init__(self, rouge_type: RougeType): - self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type) + self._rouge_type = rouge_type + super().__init__() + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a rouge score evaluation result. - def __call__(self, *, ground_truth: str, response: str, **kwargs): + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value]) + metrics = scorer.score(ground_truth, response)[self._rouge_type.value] + return { + "rouge_precision": metrics.precision, + "rouge_recall": metrics.recall, + "rouge_f1_score": metrics.fmeasure, + } + + @overload # type: ignore + def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]: """ Evaluate the ROUGE score between the response and the ground truth. @@ -90,9 +100,20 @@ def __call__(self, *, ground_truth: str, response: str, **kwargs): :return: The ROUGE score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate route score. + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The ROUGE score. 
+ :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 189fb81617b0..9095cd1ac960 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -161,10 +161,13 @@ def test_evaluate_invalid_jsonl_data(self, mock_model_config, invalid_jsonl_file def test_evaluate_missing_required_inputs(self, missing_columns_jsonl_file): with pytest.raises(EvaluationException) as exc_info: - evaluate(data=missing_columns_jsonl_file, evaluators={"g": F1ScoreEvaluator()}) - - expected_message = "Some evaluators are missing required inputs:\n" "- g: ['ground_truth']\n" + evaluate( + data=missing_columns_jsonl_file, evaluators={"g": F1ScoreEvaluator()}, fail_on_evaluator_errors=True + ) + expected_message = "Either 'conversation' or individual inputs must be provided." assert expected_message in exc_info.value.args[0] + # Same call without failure flag shouldn't produce an exception. + evaluate(data=missing_columns_jsonl_file, evaluators={"g": F1ScoreEvaluator()}) def test_evaluate_missing_required_inputs_target(self, questions_wrong_file): with pytest.raises(EvaluationException) as exc_info: @@ -174,15 +177,19 @@ def test_evaluate_missing_required_inputs_target(self, questions_wrong_file): def test_target_not_generate_required_columns(self, questions_file): with pytest.raises(EvaluationException) as exc_info: # target_fn will generate the "response", but not "ground_truth". - evaluate(data=questions_file, evaluators={"g": F1ScoreEvaluator()}, target=_target_fn) - - expected_message = "Some evaluators are missing required inputs:\n" "- g: ['ground_truth']\n" + evaluate( + data=questions_file, + evaluators={"g": F1ScoreEvaluator()}, + target=_target_fn, + fail_on_evaluator_errors=True, + ) - expected_message2 = "Verify that the target is generating the necessary columns for the evaluators. " - expected_message2 += "Currently generated columns: {'response'}" + expected_message = "Either 'conversation' or individual inputs must be provided." assert expected_message in exc_info.value.args[0] - assert expected_message2 in exc_info.value.args[0] + + # Same call without failure flag shouldn't produce an exception. 
+        evaluate(data=questions_file, evaluators={"g": F1ScoreEvaluator()}, target=_target_fn)
 
     def test_target_raises_on_outputs(self):
         """Test we are raising exception if the output is column is present in the input."""
@@ -674,13 +681,3 @@ def test_optional_inputs_with_target(self, questions_file, questions_answers_bas
         )  # type: ignore
         assert double_override_results["rows"][0]["outputs.echo.echo_query"] == "new query"
         assert double_override_results["rows"][0]["outputs.echo.echo_response"] == "new response"
-
-    def test_missing_inputs(self, questions_file):
-        """Test we are raising exception if required input is missing in data."""
-        with pytest.raises(EvaluationException) as cm:
-            evaluate(
-                data=questions_file,
-                target=_target_fn,
-                evaluators={"f1": F1ScoreEvaluator()},
-            )
-        assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0]
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
index 8f773f6f9a92..8b6a094d23f3 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
@@ -39,11 +39,13 @@ def test_save_evaluators(self, tmpdir, pf_client, evaluator) -> None:
 
     def test_load_and_run_evaluators(self, tmpdir, pf_client, data_file) -> None:
         """Test regular evaluator saving."""
-        from azure.ai.evaluation import F1ScoreEvaluator
+        # Use a test eval because save/load feature breaks, seemingly in multiple ways, when
+        # evaluators have complex imports.
+        from test_evaluators.test_inputs_evaluators import EchoEval
 
-        pf_client.flows.save(F1ScoreEvaluator, path=tmpdir)
+        pf_client.flows.save(EchoEval, path=tmpdir)
         run = pf_client.run(tmpdir, data=data_file)
         results_df = pf_client.get_details(run.name)
-        assert results_df is not None
-        assert results_df["outputs.f1_score"].notnull().all()
+        assert all(results_df["outputs.echo_query"] == results_df["inputs.query"])
+        assert all(results_df["outputs.echo_response"] == results_df["inputs.response"])
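Note: with the promptflow async wrappers removed, the refactored evaluators are still called synchronously with the same keywords; EvaluatorBase is expected to drive the async _do_eval internally. A short sketch of direct use (the example strings are placeholders, and RougeType.ROUGE_L is the L-gram member of the enum defined in _rouge.py):

    from azure.ai.evaluation import (
        BleuScoreEvaluator,
        GleuScoreEvaluator,
        MeteorScoreEvaluator,
        RougeScoreEvaluator,
        RougeType,
    )

    ground_truth = "The capital of Japan is Tokyo."
    response = "Tokyo is the capital of Japan."

    evaluators = [
        BleuScoreEvaluator(),
        GleuScoreEvaluator(),
        MeteorScoreEvaluator(),  # default alpha=0.9, beta=3.0, gamma=0.5
        RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
    ]
    for evaluator in evaluators:
        # Each call returns a dict such as {"bleu_score": ...} or
        # {"rouge_precision": ..., "rouge_recall": ..., "rouge_f1_score": ...}.
        print(evaluator(response=response, ground_truth=ground_truth))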
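Note: the pattern applied to every math evaluator in this patch (subclass EvaluatorBase, implement an async _do_eval over an eval_input dict, and forward __call__ to the base class) can also be followed for custom metrics. Below is a hypothetical exact-match evaluator as a sketch; it assumes EvaluatorBase routes single-turn response/ground_truth inputs the same way it does for the evaluators above, and it is not part of the SDK.

    from typing import Dict

    from typing_extensions import overload, override

    from azure.ai.evaluation._evaluators._common import EvaluatorBase


    class ExactMatchEvaluator(EvaluatorBase):
        """Hypothetical evaluator: 1.0 if the response equals the ground truth, else 0.0."""

        def __init__(self):
            super().__init__()

        @override
        async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
            ground_truth = eval_input["ground_truth"]
            response = eval_input["response"]
            score = 1.0 if response.strip() == ground_truth.strip() else 0.0
            return {"exact_match_score": score}

        @overload  # type: ignore
        def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]: ...

        @override
        def __call__(self, *args, **kwargs):
            return super().__call__(*args, **kwargs)

    # Expected usage, mirroring the evaluators in this patch:
    # ExactMatchEvaluator()(response="Tokyo", ground_truth="Tokyo") -> {"exact_match_score": 1.0}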