From 8bff4275f1d09f4f6e28fa3288dc405c371e8f01 Mon Sep 17 00:00:00 2001 From: MilesHolland <108901744+MilesHolland@users.noreply.github.com> Date: Mon, 23 Dec 2024 10:46:15 -0500 Subject: [PATCH] refactor math evals (#38951) * refactor math evals * fix tests, add fail flag to evaluate --- .../ai/evaluation/_evaluate/_evaluate.py | 42 ++++++ .../ai/evaluation/_evaluators/_bleu/_bleu.py | 71 ++++++---- .../_evaluators/_f1_score/_f1_score.py | 129 +++++++++--------- .../ai/evaluation/_evaluators/_gleu/_gleu.py | 68 +++++---- .../evaluation/_evaluators/_meteor/_meteor.py | 89 +++++++----- .../evaluation/_evaluators/_rouge/_rouge.py | 67 +++++---- .../tests/unittests/test_evaluate.py | 35 +++-- .../tests/unittests/test_save_eval.py | 10 +- 8 files changed, 315 insertions(+), 196 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 87ce23f0669a..a325e8c68c74 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -20,6 +20,7 @@ from .._constants import ( CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, + DefaultOpenEncoding, Prefixes, _InternalEvaluationMetrics, ) @@ -569,6 +570,7 @@ def evaluate( evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None, azure_ai_project: Optional[AzureAIProject] = None, output_path: Optional[Union[str, os.PathLike]] = None, + fail_on_evaluator_errors: bool = False, **kwargs, ) -> EvaluationResult: """Evaluates target or data with built-in or custom evaluators. If both target and data are provided, @@ -594,6 +596,11 @@ def evaluate( :paramtype output_path: Optional[str] :keyword azure_ai_project: Logs evaluation results to AI Studio if set. :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject] + :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException + if ANY evaluator fails during their evaluation. + Defaults to false, which means that evaluations will continue regardless of failures. + If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs. + :paramtype fail_on_evaluator_errors: bool :return: Evaluation results. :rtype: ~azure.ai.evaluation.EvaluationResult @@ -615,6 +622,7 @@ def evaluate( evaluator_config=evaluator_config, azure_ai_project=azure_ai_project, output_path=output_path, + fail_on_evaluator_errors=fail_on_evaluator_errors, **kwargs, ) except Exception as e: @@ -663,6 +671,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None: print("\n====================================================\n") +def _print_fail_flag_warning() -> None: + print( + "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable " + + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, " + + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing " + + "without producing any outputs, since a single failure will cancel the entire run " + "when fail_on_evaluator_errors is enabled." 
+ ) + + def _evaluate( # pylint: disable=too-many-locals,too-many-statements *, evaluators: Dict[str, Callable], @@ -672,8 +690,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None, azure_ai_project: Optional[AzureAIProject] = None, output_path: Optional[Union[str, os.PathLike]] = None, + fail_on_evaluator_errors: bool = False, **kwargs, ) -> EvaluationResult: + if fail_on_evaluator_errors: + _print_fail_flag_warning() input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name) # Process evaluator config to replace ${target.} with ${data.} @@ -773,6 +794,10 @@ def eval_batch_run( evaluators_result_df = None evaluators_metric = {} for evaluator_name, evaluator_result in per_evaluator_results.items(): + if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0: + _print_summary(per_evaluator_results) + _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json") + evaluator_result_df = evaluator_result["result"] # drop input columns @@ -825,3 +850,20 @@ def eval_batch_run( _write_output(output_path, result) return result + + +def _turn_error_logs_into_exception(log_path: str) -> None: + """Produce an EvaluationException using the contents of the inputted + file as the error message. + + :param log_path: The path to the error log file. + :type log_path: str + """ + with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file: + error_message = file.read() + raise EvaluationException( + message=error_message, + target=ErrorTarget.EVALUATE, + category=ErrorCategory.FAILED_EXECUTION, + blame=ErrorBlame.UNKNOWN, + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py index 9f1b2d78165a..f651bfc14f6b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py @@ -1,30 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Dict from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import overload, override from azure.ai.evaluation._common.utils import nltk_tokenize +from azure.ai.evaluation._evaluators._common import EvaluatorBase -class _AsyncBleuScoreEvaluator: - def __init__(self): - pass - - async def __call__(self, *, response: str, ground_truth: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(response) - - # NIST Smoothing - smoothing_function = SmoothingFunction().method4 - score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function) - - return { - "bleu_score": score, - } - -class BleuScoreEvaluator: +class BleuScoreEvaluator(EvaluatorBase): """ Calculate the BLEU score for a given response and ground truth. 
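Note: a minimal usage sketch of the new fail_on_evaluator_errors keyword added to evaluate() above. The data path and column names are placeholders; with the flag set, a single per-row evaluator failure raises an EvaluationException built from the batch run's error log instead of finishing with missing metrics.

    # Sketch only: "data.jsonl" is a placeholder dataset with "response" and
    # "ground_truth" columns for the F1 evaluator.
    from azure.ai.evaluation import F1ScoreEvaluator, evaluate

    result = evaluate(
        data="data.jsonl",
        evaluators={"f1": F1ScoreEvaluator()},
        fail_on_evaluator_errors=True,  # cancel the whole run on the first evaluator failure
    )
    print(result["metrics"])  # aggregated metric values
    print(result["rows"][0])  # per-row inputs and outputs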
@@ -51,9 +37,32 @@ class BleuScoreEvaluator: """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" def __init__(self): - self._async_evaluator = _AsyncBleuScoreEvaluator() + super().__init__() + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a glue score evaluation result. + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + reference_tokens = nltk_tokenize(ground_truth) + hypothesis_tokens = nltk_tokenize(response) - def __call__(self, *, response: str, ground_truth: str, **kwargs): + # NIST Smoothing + smoothing_function = SmoothingFunction().method4 + score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function) + + return { + "bleu_score": score, + } + + @overload # type: ignore + def __call__(self, *, response: str, ground_truth: str): """ Evaluate the BLEU score between the response and the ground truth. @@ -64,9 +73,21 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs): :return: The BLEU score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate the BLEU score between the response and the ground truth. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The BLEU score. + :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py index 3d7c3708336a..3faed440f25b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py @@ -3,45 +3,44 @@ # --------------------------------------------------------- from collections import Counter -from typing import List +from typing import List, Dict +from typing_extensions import overload, override -from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._evaluators._common import EvaluatorBase -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +class F1ScoreEvaluator(EvaluatorBase): + """ + Calculates the F1 score for a given response and ground truth or a multi-turn conversation. -class _AsyncF1ScoreEvaluator: - def __init__(self): - pass + F1 Scores range from 0 to 1, with 1 being the best possible score. - async def __call__(self, *, response: str, ground_truth: str, **kwargs): - """ - Evaluate F1 score. + The F1-score computes the ratio of the number of shared words between the model generation and + the ground truth. Ratio is computed over the individual words in the generated response against those in the ground + truth answer. 
The number of shared words between the generation and the truth is the basis of the F1 score: + precision is the ratio of the number of shared words to the total number of words in the generation, and recall + is the ratio of the number of shared words to the total number of words in the ground truth. - :keyword response: The response to be evaluated. - :paramtype response: str - :keyword ground_truth: The ground truth to be evaluated. - :paramtype ground_truth: str - :return: The F1 score. - :rtype: Dict[str, float] - """ - # Validate inputs - if not (response and response.strip() and response != "None") or not ( - ground_truth and ground_truth.strip() and ground_truth != "None" - ): - msg = "Both 'response' and 'ground_truth' must be non-empty strings." - raise EvaluationException( - message=msg, - internal_message=msg, - error_category=ErrorCategory.MISSING_FIELD, - error_blame=ErrorBlame.USER_ERROR, - error_target=ErrorTarget.F1_EVALUATOR, - ) + Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your + model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate + information in the response. - # Run f1 score computation. - f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth) - return {"f1_score": f1_result} + .. admonition:: Example: + + .. literalinclude:: ../samples/evaluation_samples_evaluate.py + :start-after: [START f1_score_evaluator] + :end-before: [END f1_score_evaluator] + :language: python + :dedent: 8 + :caption: Initialize and call an F1ScoreEvaluator. + """ + + id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3" + """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + + def __init__(self): + super().__init__() @classmethod def _compute_f1_score(cls, response: str, ground_truth: str) -> float: @@ -103,41 +102,24 @@ def lower(text): return f1 + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce an f1 score evaluation result. -class F1ScoreEvaluator: - """ - Calculates the F1 score for a given response and ground truth or a multi-turn conversation. - - F1 Scores range from 0 to 1, with 1 being the best possible score. - - The F1-score computes the ratio of the number of shared words between the model generation and - the ground truth. Ratio is computed over the individual words in the generated response against those in the ground - truth answer. The number of shared words between the generation and the truth is the basis of the F1 score: - precision is the ratio of the number of shared words to the total number of words in the generation, and recall - is the ratio of the number of shared words to the total number of words in the ground truth. - - Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your - model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate - information in the response. - - - .. admonition:: Example: - - .. literalinclude:: ../samples/evaluation_samples_evaluate.py - :start-after: [START f1_score_evaluator] - :end-before: [END f1_score_evaluator] - :language: python - :dedent: 8 - :caption: Initialize and call an F1ScoreEvaluator. 
- """ - - id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3" - """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + # Run f1 score computation. + f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth) - def __init__(self): - self._async_evaluator = _AsyncF1ScoreEvaluator() + return {"f1_score": f1_result} - def __call__(self, *, response: str, ground_truth: str, **kwargs): + @overload # type: ignore + def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]: """ Evaluate F1 score. @@ -149,9 +131,20 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs): :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs - ) + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate F1 score. - def _to_async(self): - return self._async_evaluator + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str + :return: The F1 score. + :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py index 059f1b7b952a..5416635eb552 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_gleu/_gleu.py @@ -1,28 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Dict from nltk.translate.gleu_score import sentence_gleu -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import overload, override from azure.ai.evaluation._common.utils import nltk_tokenize +from azure.ai.evaluation._evaluators._common import EvaluatorBase -class _AsyncGleuScoreEvaluator: - def __init__(self): - pass - - async def __call__(self, *, ground_truth: str, response: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(response) - - score = sentence_gleu([reference_tokens], hypothesis_tokens) - - return { - "gleu_score": score, - } - -class GleuScoreEvaluator: +class GleuScoreEvaluator(EvaluatorBase): """ Calculates the GLEU (Google-BLEU) score between a response and the ground truth. @@ -47,10 +35,32 @@ class GleuScoreEvaluator: id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + @override def __init__(self): - self._async_evaluator = _AsyncGleuScoreEvaluator() + super().__init__() + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a glue score evaluation result. + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. 
+ :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + reference_tokens = nltk_tokenize(ground_truth) + hypothesis_tokens = nltk_tokenize(response) - def __call__(self, *, ground_truth: str, response: str, **kwargs): + score = sentence_gleu([reference_tokens], hypothesis_tokens) + + return { + "gleu_score": score, + } + + @overload # type: ignore + def __call__(self, *, ground_truth: str, response: str): """ Evaluate the GLEU score between the response and the ground truth. @@ -61,9 +71,21 @@ def __call__(self, *, ground_truth: str, response: str, **kwargs): :return: The GLEU score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate the GLEU score between the response and the ground truth. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The GLEU score. + :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py index 82af6116b3ff..c421c210480c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_meteor/_meteor.py @@ -1,38 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Dict + from nltk.translate.meteor_score import meteor_score -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import overload, override from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded +from azure.ai.evaluation._evaluators._common import EvaluatorBase -class _AsyncMeteorScoreEvaluator: - def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5): - self._alpha = alpha - self._beta = beta - self._gamma = gamma - - ensure_nltk_data_downloaded() - - async def __call__(self, *, ground_truth: str, response: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(response) - - score = meteor_score( - [reference_tokens], - hypothesis_tokens, - alpha=self._alpha, - beta=self._beta, - gamma=self._gamma, - ) - - return { - "meteor_score": score, - } - - -class MeteorScoreEvaluator: +class MeteorScoreEvaluator(EvaluatorBase): """ Calculates the METEOR score for a given response and ground truth. 
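Note: the BLEU and GLEU _do_eval bodies above keep the same NLTK scoring that the removed async classes used. A standalone sketch of those underlying calls, using nltk.word_tokenize in place of the SDK-internal nltk_tokenize helper:

    from nltk import download, word_tokenize
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
    from nltk.translate.gleu_score import sentence_gleu

    download("punkt", quiet=True)  # tokenizer data, if not already present

    ground_truth = "The capital of Japan is Tokyo."
    response = "Tokyo is the capital of Japan."

    reference_tokens = word_tokenize(ground_truth)
    hypothesis_tokens = word_tokenize(response)

    # NIST smoothing (method4), matching BleuScoreEvaluator._do_eval.
    bleu = sentence_bleu(
        [reference_tokens], hypothesis_tokens, smoothing_function=SmoothingFunction().method4
    )
    # Plain sentence-level GLEU, matching GleuScoreEvaluator._do_eval.
    gleu = sentence_gleu([reference_tokens], hypothesis_tokens)
    print({"bleu_score": bleu, "gleu_score": gleu})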
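Note: a worked sketch of the shared-word F1 described in the F1ScoreEvaluator docstring above; the real _compute_f1_score additionally lowercases and strips punctuation and articles before counting.

    from collections import Counter

    response_tokens = "tokyo is the capital of japan".split()
    truth_tokens = "the capital of japan is tokyo".split()

    # Shared words, counted with multiplicity.
    num_shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())

    precision = num_shared / len(response_tokens)  # shared / words in the generation
    recall = num_shared / len(truth_tokens)        # shared / words in the ground truth
    f1 = 2 * precision * recall / (precision + recall)
    print({"f1_score": f1})  # 1.0 here: the two token multisets are identical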
@@ -68,10 +46,41 @@ class MeteorScoreEvaluator: id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + @override def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5): - self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma) + self._alpha = alpha + self._beta = beta + self._gamma = gamma + ensure_nltk_data_downloaded() + super().__init__() - def __call__(self, *, ground_truth: str, response: str, **kwargs): + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a meteor score evaluation result. + + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + reference_tokens = nltk_tokenize(ground_truth) + hypothesis_tokens = nltk_tokenize(response) + score = meteor_score( + [reference_tokens], + hypothesis_tokens, + alpha=self._alpha, + beta=self._beta, + gamma=self._gamma, + ) + + return { + "meteor_score": score, + } + + @overload # type: ignore + def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]: """ Evaluate the METEOR score between the response and the ground truth. @@ -82,9 +91,21 @@ def __call__(self, *, ground_truth: str, response: str, **kwargs): :return: The METEOR score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate the METEOR score between the response and the ground truth. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The METEOR score. 
+ :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py index 458786b812e3..4298be9127fa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py @@ -3,9 +3,11 @@ # --------------------------------------------------------- from enum import Enum -from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing import Dict +from typing_extensions import overload, override from azure.ai.evaluation._vendor.rouge_score import rouge_scorer +from azure.ai.evaluation._evaluators._common import EvaluatorBase class RougeType(Enum): @@ -32,21 +34,7 @@ class RougeType(Enum): """Overlap of L-grams (L consecutive words) between generated and reference text.""" -class _AsyncRougeScoreEvaluator: - def __init__(self, rouge_type: RougeType): - self._rouge_type = rouge_type - - async def __call__(self, *, ground_truth: str, response: str, **kwargs): - scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value]) - metrics = scorer.score(ground_truth, response)[self._rouge_type.value] - return { - "rouge_precision": metrics.precision, - "rouge_recall": metrics.recall, - "rouge_f1_score": metrics.fmeasure, - } - - -class RougeScoreEvaluator: +class RougeScoreEvaluator(EvaluatorBase): """ Calculates the ROUGE score for a given response and ground truth. @@ -76,10 +64,32 @@ class RougeScoreEvaluator: id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + @override def __init__(self, rouge_type: RougeType): - self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type) + self._rouge_type = rouge_type + super().__init__() + + @override + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: + """Produce a rouge score evaluation result. - def __call__(self, *, ground_truth: str, response: str, **kwargs): + :param eval_input: The input to the evaluation function. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + ground_truth = eval_input["ground_truth"] + response = eval_input["response"] + scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value]) + metrics = scorer.score(ground_truth, response)[self._rouge_type.value] + return { + "rouge_precision": metrics.precision, + "rouge_recall": metrics.recall, + "rouge_f1_score": metrics.fmeasure, + } + + @overload # type: ignore + def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]: """ Evaluate the ROUGE score between the response and the ground truth. @@ -90,9 +100,20 @@ def __call__(self, *, ground_truth: str, response: str, **kwargs): :return: The ROUGE score. :rtype: Dict[str, float] """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs - ) - def _to_async(self): - return self._async_evaluator + @override + def __call__( # pylint: disable=docstring-missing-param + self, + *args, + **kwargs, + ): + """ + Evaluate route score. + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be compared against. + :paramtype ground_truth: str + :return: The ROUGE score. 
+ :rtype: Dict[str, float] + """ + return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 189fb81617b0..9095cd1ac960 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -161,10 +161,13 @@ def test_evaluate_invalid_jsonl_data(self, mock_model_config, invalid_jsonl_file def test_evaluate_missing_required_inputs(self, missing_columns_jsonl_file): with pytest.raises(EvaluationException) as exc_info: - evaluate(data=missing_columns_jsonl_file, evaluators={"g": F1ScoreEvaluator()}) - - expected_message = "Some evaluators are missing required inputs:\n" "- g: ['ground_truth']\n" + evaluate( + data=missing_columns_jsonl_file, evaluators={"g": F1ScoreEvaluator()}, fail_on_evaluator_errors=True + ) + expected_message = "Either 'conversation' or individual inputs must be provided." assert expected_message in exc_info.value.args[0] + # Same call without failure flag shouldn't produce an exception. + evaluate(data=missing_columns_jsonl_file, evaluators={"g": F1ScoreEvaluator()}) def test_evaluate_missing_required_inputs_target(self, questions_wrong_file): with pytest.raises(EvaluationException) as exc_info: @@ -174,15 +177,19 @@ def test_evaluate_missing_required_inputs_target(self, questions_wrong_file): def test_target_not_generate_required_columns(self, questions_file): with pytest.raises(EvaluationException) as exc_info: # target_fn will generate the "response", but not "ground_truth". - evaluate(data=questions_file, evaluators={"g": F1ScoreEvaluator()}, target=_target_fn) - - expected_message = "Some evaluators are missing required inputs:\n" "- g: ['ground_truth']\n" + evaluate( + data=questions_file, + evaluators={"g": F1ScoreEvaluator()}, + target=_target_fn, + fail_on_evaluator_errors=True, + ) - expected_message2 = "Verify that the target is generating the necessary columns for the evaluators. " - expected_message2 += "Currently generated columns: {'response'}" + expected_message = "Either 'conversation' or individual inputs must be provided." assert expected_message in exc_info.value.args[0] - assert expected_message2 in exc_info.value.args[0] + + # Same call without failure flag shouldn't produce an exception. 
+        evaluate(data=questions_file, evaluators={"g": F1ScoreEvaluator()}, target=_target_fn)
 
     def test_target_raises_on_outputs(self):
         """Test we are raising exception if the output is column is present in the input."""
@@ -674,13 +681,3 @@ def test_optional_inputs_with_target(self, questions_file, questions_answers_bas
         )  # type: ignore
         assert double_override_results["rows"][0]["outputs.echo.echo_query"] == "new query"
         assert double_override_results["rows"][0]["outputs.echo.echo_response"] == "new response"
-
-    def test_missing_inputs(self, questions_file):
-        """Test we are raising exception if required input is missing in data."""
-        with pytest.raises(EvaluationException) as cm:
-            evaluate(
-                data=questions_file,
-                target=_target_fn,
-                evaluators={"f1": F1ScoreEvaluator()},
-            )
-        assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0]
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
index 8f773f6f9a92..8b6a094d23f3 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
@@ -39,11 +39,13 @@ def test_save_evaluators(self, tmpdir, pf_client, evaluator) -> None:
 
     def test_load_and_run_evaluators(self, tmpdir, pf_client, data_file) -> None:
         """Test regular evaluator saving."""
-        from azure.ai.evaluation import F1ScoreEvaluator
+        # Use a test eval because save/load feature breaks, seemingly in multiple ways, when
+        # evaluators have complex imports.
+        from test_evaluators.test_inputs_evaluators import EchoEval
 
-        pf_client.flows.save(F1ScoreEvaluator, path=tmpdir)
+        pf_client.flows.save(EchoEval, path=tmpdir)
         run = pf_client.run(tmpdir, data=data_file)
         results_df = pf_client.get_details(run.name)
-        assert results_df is not None
-        assert results_df["outputs.f1_score"].notnull().all()
+        assert all(results_df["outputs.echo_query"] == results_df["inputs.query"])
+        assert all(results_df["outputs.echo_response"] == results_df["inputs.response"])
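Note: with the promptflow async wrappers removed, the refactored evaluators are still called synchronously with the same keywords; EvaluatorBase is expected to drive the async _do_eval internally. A short sketch of direct use (the example strings are placeholders, and RougeType.ROUGE_L is the L-gram member of the enum defined in _rouge.py):

    from azure.ai.evaluation import (
        BleuScoreEvaluator,
        GleuScoreEvaluator,
        MeteorScoreEvaluator,
        RougeScoreEvaluator,
        RougeType,
    )

    ground_truth = "The capital of Japan is Tokyo."
    response = "Tokyo is the capital of Japan."

    evaluators = [
        BleuScoreEvaluator(),
        GleuScoreEvaluator(),
        MeteorScoreEvaluator(),  # default alpha=0.9, beta=3.0, gamma=0.5
        RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
    ]
    for evaluator in evaluators:
        # Each call returns a dict such as {"bleu_score": ...} or
        # {"rouge_precision": ..., "rouge_recall": ..., "rouge_f1_score": ...}.
        print(evaluator(response=response, ground_truth=ground_truth))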
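Note: the pattern applied to every math evaluator in this patch (subclass EvaluatorBase, implement an async _do_eval over an eval_input dict, and forward __call__ to the base class) can also be followed for custom metrics. Below is a hypothetical exact-match evaluator as a sketch; it assumes EvaluatorBase routes single-turn response/ground_truth inputs the same way it does for the evaluators above, and it is not part of the SDK.

    from typing import Dict

    from typing_extensions import overload, override

    from azure.ai.evaluation._evaluators._common import EvaluatorBase


    class ExactMatchEvaluator(EvaluatorBase):
        """Hypothetical evaluator: 1.0 if the response equals the ground truth, else 0.0."""

        def __init__(self):
            super().__init__()

        @override
        async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
            ground_truth = eval_input["ground_truth"]
            response = eval_input["response"]
            score = 1.0 if response.strip() == ground_truth.strip() else 0.0
            return {"exact_match_score": score}

        @overload  # type: ignore
        def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]: ...

        @override
        def __call__(self, *args, **kwargs):
            return super().__call__(*args, **kwargs)

    # Expected usage, mirroring the evaluators in this patch:
    # ExactMatchEvaluator()(response="Tokyo", ground_truth="Tokyo") -> {"exact_match_score": 1.0}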