refactor math evals (Azure#38951)
* refactor math evals

* fix tests, add fail flag to evaluate
MilesHolland authored Dec 23, 2024
1 parent 487d0b4 commit 8bff427
Showing 8 changed files with 315 additions and 196 deletions.
@@ -20,6 +20,7 @@
from .._constants import (
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
EvaluationMetrics,
DefaultOpenEncoding,
Prefixes,
_InternalEvaluationMetrics,
)
@@ -569,6 +570,7 @@ def evaluate(
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
azure_ai_project: Optional[AzureAIProject] = None,
output_path: Optional[Union[str, os.PathLike]] = None,
fail_on_evaluator_errors: bool = False,
**kwargs,
) -> EvaluationResult:
"""Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -594,6 +596,11 @@
:paramtype output_path: Optional[str]
:keyword azure_ai_project: Logs evaluation results to AI Studio if set.
:paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
:keyword fail_on_evaluator_errors: Whether the evaluation should cancel early with an EvaluationException
if ANY evaluator fails during its evaluation.
Defaults to False, which means that evaluations will continue regardless of failures.
If such failures occur, metrics may be missing, and evidence of the failures can be found in the evaluation's logs.
:paramtype fail_on_evaluator_errors: bool
:return: Evaluation results.
:rtype: ~azure.ai.evaluation.EvaluationResult
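
As a usage sketch of the new flag (not taken from this commit: the data file name, its schema, and the evaluator choices below are assumptions), fail_on_evaluator_errors is passed like any other keyword of evaluate:

# Illustrative only: fail fast while iterating on a small dataset.
from azure.ai.evaluation import evaluate, BleuScoreEvaluator, F1ScoreEvaluator

result = evaluate(
    data="eval_inputs.jsonl",  # assumed JSONL file with "response" and "ground_truth" columns
    evaluators={
        "bleu": BleuScoreEvaluator(),
        "f1": F1ScoreEvaluator(),
    },
    fail_on_evaluator_errors=True,  # raise instead of continuing with missing metrics
)
print(result["metrics"])
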
@@ -615,6 +622,7 @@
evaluator_config=evaluator_config,
azure_ai_project=azure_ai_project,
output_path=output_path,
fail_on_evaluator_errors=fail_on_evaluator_errors,
**kwargs,
)
except Exception as e:
@@ -663,6 +671,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
print("\n====================================================\n")


def _print_fail_flag_warning() -> None:
print(
"Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+ "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+ "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+ "without producing any outputs, since a single failure will cancel the entire run "
"when fail_on_evaluator_errors is enabled."
)


def _evaluate( # pylint: disable=too-many-locals,too-many-statements
*,
evaluators: Dict[str, Callable],
@@ -672,8 +690,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
azure_ai_project: Optional[AzureAIProject] = None,
output_path: Optional[Union[str, os.PathLike]] = None,
fail_on_evaluator_errors: bool = False,
**kwargs,
) -> EvaluationResult:
if fail_on_evaluator_errors:
_print_fail_flag_warning()
input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

# Process evaluator config to replace ${target.} with ${data.}
@@ -773,6 +794,10 @@ def eval_batch_run(
evaluators_result_df = None
evaluators_metric = {}
for evaluator_name, evaluator_result in per_evaluator_results.items():
if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
_print_summary(per_evaluator_results)
_turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")

evaluator_result_df = evaluator_result["result"]

# drop input columns
@@ -825,3 +850,20 @@ def eval_batch_run(
_write_output(output_path, result)

return result


def _turn_error_logs_into_exception(log_path: str) -> None:
"""Produce an EvaluationException using the contents of the inputted
file as the error message.
:param log_path: The path to the error log file.
:type log_path: str
"""
with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
error_message = file.read()
raise EvaluationException(
message=error_message,
target=ErrorTarget.EVALUATE,
category=ErrorCategory.FAILED_EXECUTION,
blame=ErrorBlame.UNKNOWN,
)
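
For completeness, a hedged sketch of consuming the new fail-fast behavior; it assumes the raised EvaluationException propagates out of evaluate, and imports it from the same private module this diff uses. The data file and evaluator choice are illustrative.

from azure.ai.evaluation import evaluate, F1ScoreEvaluator
from azure.ai.evaluation._exceptions import EvaluationException  # private module; import path as used in this diff

try:
    evaluate(
        data="eval_inputs.jsonl",  # assumed input file
        evaluators={"f1": F1ScoreEvaluator()},
        fail_on_evaluator_errors=True,
    )
except EvaluationException as exc:
    # With the flag enabled, the exception message is the contents of the failing run's error.json log.
    print(f"Evaluation cancelled early:\n{exc}")
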
@@ -1,30 +1,16 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Dict
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from promptflow._utils.async_utils import async_run_allowing_running_loop
from typing_extensions import overload, override

from azure.ai.evaluation._common.utils import nltk_tokenize

from azure.ai.evaluation._evaluators._common import EvaluatorBase

class _AsyncBleuScoreEvaluator:
def __init__(self):
pass

async def __call__(self, *, response: str, ground_truth: str, **kwargs):
reference_tokens = nltk_tokenize(ground_truth)
hypothesis_tokens = nltk_tokenize(response)

# NIST Smoothing
smoothing_function = SmoothingFunction().method4
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

return {
"bleu_score": score,
}


class BleuScoreEvaluator:
class BleuScoreEvaluator(EvaluatorBase):
"""
Calculate the BLEU score for a given response and ground truth.
@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

def __init__(self):
self._async_evaluator = _AsyncBleuScoreEvaluator()
super().__init__()

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
"""Produce a glue score evaluation result.
:param eval_input: The input to the evaluation function.
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict
"""
ground_truth = eval_input["ground_truth"]
response = eval_input["response"]
reference_tokens = nltk_tokenize(ground_truth)
hypothesis_tokens = nltk_tokenize(response)

def __call__(self, *, response: str, ground_truth: str, **kwargs):
# NIST Smoothing
smoothing_function = SmoothingFunction().method4
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

return {
"bleu_score": score,
}

@overload # type: ignore
def __call__(self, *, response: str, ground_truth: str):
"""
Evaluate the BLEU score between the response and the ground truth.
@@ -64,9 +73,21 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
:return: The BLEU score.
:rtype: Dict[str, float]
"""
return async_run_allowing_running_loop(
self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
)

def _to_async(self):
return self._async_evaluator
@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""
Evaluate the BLEU score between the response and the ground truth.
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be compared against.
:paramtype ground_truth: str
:return: The BLEU score.
:rtype: Dict[str, float]
"""
return super().__call__(*args, **kwargs)
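
To make the refactor concrete, here is a standalone sketch of the BLEU computation that _do_eval performs, using nltk directly. word_tokenize stands in for the SDK's nltk_tokenize helper, and the example strings are invented, so treat it as an approximation rather than the evaluator's exact behavior.

# Requires nltk and its "punkt" tokenizer data (nltk.download("punkt")).
from nltk import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

ground_truth = "The capital of Japan is Tokyo."
response = "Tokyo is the capital of Japan."

reference_tokens = word_tokenize(ground_truth)
hypothesis_tokens = word_tokenize(response)

# NIST geometric-sequence smoothing (method4), as in the evaluator above.
smoothing_function = SmoothingFunction().method4
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
print({"bleu_score": score})

From the caller's perspective the evaluator is invoked as before, e.g. BleuScoreEvaluator()(response=response, ground_truth=ground_truth) still returns a dict with a "bleu_score" key, now routed through EvaluatorBase.__call__.
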
@@ -3,45 +3,44 @@
# ---------------------------------------------------------

from collections import Counter
from typing import List
from typing import List, Dict
from typing_extensions import overload, override

from promptflow._utils.async_utils import async_run_allowing_running_loop
from azure.ai.evaluation._evaluators._common import EvaluatorBase

from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

class F1ScoreEvaluator(EvaluatorBase):
"""
Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
class _AsyncF1ScoreEvaluator:
def __init__(self):
pass
F1 Scores range from 0 to 1, with 1 being the best possible score.
async def __call__(self, *, response: str, ground_truth: str, **kwargs):
"""
Evaluate F1 score.
The F1-score computes the ratio of the number of shared words between the model generation and
the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
precision is the ratio of the number of shared words to the total number of words in the generation, and recall
is the ratio of the number of shared words to the total number of words in the ground truth.
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be evaluated.
:paramtype ground_truth: str
:return: The F1 score.
:rtype: Dict[str, float]
"""
# Validate inputs
if not (response and response.strip() and response != "None") or not (
ground_truth and ground_truth.strip() and ground_truth != "None"
):
msg = "Both 'response' and 'ground_truth' must be non-empty strings."
raise EvaluationException(
message=msg,
internal_message=msg,
error_category=ErrorCategory.MISSING_FIELD,
error_blame=ErrorBlame.USER_ERROR,
error_target=ErrorTarget.F1_EVALUATOR,
)
Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
information in the response.
# Run f1 score computation.
f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
return {"f1_score": f1_result}
.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START f1_score_evaluator]
:end-before: [END f1_score_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call an F1ScoreEvaluator.
"""

id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

def __init__(self):
super().__init__()

@classmethod
def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +102,24 @@ def lower(text):

return f1

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
"""Produce an f1 score evaluation result.
class F1ScoreEvaluator:
"""
Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
F1 Scores range from 0 to 1, with 1 being the best possible score.
The F1-score computes the ratio of the number of shared words between the model generation and
the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
precision is the ratio of the number of shared words to the total number of words in the generation, and recall
is the ratio of the number of shared words to the total number of words in the ground truth.
Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
information in the response.
.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START f1_score_evaluator]
:end-before: [END f1_score_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call an F1ScoreEvaluator.
"""

id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
:param eval_input: The input to the evaluation function.
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict
"""
ground_truth = eval_input["ground_truth"]
response = eval_input["response"]
# Run f1 score computation.
f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

def __init__(self):
self._async_evaluator = _AsyncF1ScoreEvaluator()
return {"f1_score": f1_result}

def __call__(self, *, response: str, ground_truth: str, **kwargs):
@overload # type: ignore
def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
"""
Evaluate F1 score.
@@ -149,9 +131,20 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
:rtype: Dict[str, float]
"""

return async_run_allowing_running_loop(
self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
)
@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""
Evaluate F1 score.
def _to_async(self):
return self._async_evaluator
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be evaluated.
:paramtype ground_truth: str
:return: The F1 score.
:rtype: Dict[str, float]
"""
return super().__call__(*args, **kwargs)
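
Likewise, a hedged sketch of the word-overlap F1 described in the docstring above. It uses plain whitespace tokenization, whereas the SDK's _compute_f1_score also normalizes the text first (note the lower helper visible in the hunk header), so the scores will differ slightly.

from collections import Counter


def word_overlap_f1(response: str, ground_truth: str) -> float:
    """Harmonic mean of precision and recall over shared words (simplified sketch)."""
    response_tokens = response.split()
    truth_tokens = ground_truth.split()
    # Shared words, counted with multiplicity.
    shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())
    if shared == 0:
        return 0.0
    precision = shared / len(response_tokens)  # shared words / words in the generation
    recall = shared / len(truth_tokens)  # shared words / words in the ground truth
    return 2 * precision * recall / (precision + recall)


print(word_overlap_f1("Tokyo is the capital of Japan", "The capital of Japan is Tokyo"))
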
(Diffs for the remaining changed files were not loaded.)
