refactor math evals (Azure#38951)
* refactor math evals

* fix tests, add fail flag to evaluate
MilesHolland authored Dec 23, 2024
1 parent 487d0b4 commit 8bff427
Showing 8 changed files with 315 additions and 196 deletions.
@@ -20,6 +20,7 @@
from .._constants import (
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
EvaluationMetrics,
DefaultOpenEncoding,
Prefixes,
_InternalEvaluationMetrics,
)
@@ -569,6 +570,7 @@ def evaluate(
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
azure_ai_project: Optional[AzureAIProject] = None,
output_path: Optional[Union[str, os.PathLike]] = None,
fail_on_evaluator_errors: bool = False,
**kwargs,
) -> EvaluationResult:
"""Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -594,6 +596,11 @@
:paramtype output_path: Optional[str]
:keyword azure_ai_project: Logs evaluation results to AI Studio if set.
:paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
:keyword fail_on_evaluator_errors: Whether the evaluation should cancel early with an EvaluationException
if ANY evaluator fails during its evaluation.
Defaults to False, which means that evaluations will continue regardless of failures.
If such failures occur, metrics may be missing, and evidence of the failures can be found in the evaluation's logs.
:paramtype fail_on_evaluator_errors: bool
:return: Evaluation results.
:rtype: ~azure.ai.evaluation.EvaluationResult
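
As a usage sketch of the new flag (not taken from this commit: the data file name, its schema, and the evaluator choices below are assumptions), fail_on_evaluator_errors is passed like any other keyword of evaluate:

# Illustrative only: fail fast while iterating on a small dataset.
from azure.ai.evaluation import evaluate, BleuScoreEvaluator, F1ScoreEvaluator

result = evaluate(
    data="eval_inputs.jsonl",  # assumed JSONL file with "response" and "ground_truth" columns
    evaluators={
        "bleu": BleuScoreEvaluator(),
        "f1": F1ScoreEvaluator(),
    },
    fail_on_evaluator_errors=True,  # raise instead of continuing with missing metrics
)
print(result["metrics"])
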
@@ -615,6 +622,7 @@
evaluator_config=evaluator_config,
azure_ai_project=azure_ai_project,
output_path=output_path,
fail_on_evaluator_errors=fail_on_evaluator_errors,
**kwargs,
)
except Exception as e:
@@ -663,6 +671,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
print("\n====================================================\n")


def _print_fail_flag_warning() -> None:
print(
"Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+ "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+ "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+ "without producing any outputs, since a single failure will cancel the entire run "
"when fail_on_evaluator_errors is enabled."
)


def _evaluate( # pylint: disable=too-many-locals,too-many-statements
*,
evaluators: Dict[str, Callable],
@@ -672,8 +690,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
azure_ai_project: Optional[AzureAIProject] = None,
output_path: Optional[Union[str, os.PathLike]] = None,
fail_on_evaluator_errors: bool = False,
**kwargs,
) -> EvaluationResult:
if fail_on_evaluator_errors:
_print_fail_flag_warning()
input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

# Process evaluator config to replace ${target.} with ${data.}
@@ -773,6 +794,10 @@ def eval_batch_run(
evaluators_result_df = None
evaluators_metric = {}
for evaluator_name, evaluator_result in per_evaluator_results.items():
if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
_print_summary(per_evaluator_results)
_turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")

evaluator_result_df = evaluator_result["result"]

# drop input columns
@@ -825,3 +850,20 @@ def eval_batch_run(
_write_output(output_path, result)

return result


def _turn_error_logs_into_exception(log_path: str) -> None:
"""Produce an EvaluationException using the contents of the inputted
file as the error message.
:param log_path: The path to the error log file.
:type log_path: str
"""
with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
error_message = file.read()
raise EvaluationException(
message=error_message,
target=ErrorTarget.EVALUATE,
category=ErrorCategory.FAILED_EXECUTION,
blame=ErrorBlame.UNKNOWN,
)
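
For completeness, a hedged sketch of consuming the new fail-fast behavior; it assumes the raised EvaluationException propagates out of evaluate, and imports it from the same private module this diff uses. The data file and evaluator choice are illustrative.

from azure.ai.evaluation import evaluate, F1ScoreEvaluator
from azure.ai.evaluation._exceptions import EvaluationException  # private module; import path as used in this diff

try:
    evaluate(
        data="eval_inputs.jsonl",  # assumed input file
        evaluators={"f1": F1ScoreEvaluator()},
        fail_on_evaluator_errors=True,
    )
except EvaluationException as exc:
    # With the flag enabled, the exception message is the contents of the failing run's error.json log.
    print(f"Evaluation cancelled early:\n{exc}")
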
@@ -1,30 +1,16 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Dict
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from promptflow._utils.async_utils import async_run_allowing_running_loop
from typing_extensions import overload, override

from azure.ai.evaluation._common.utils import nltk_tokenize

from azure.ai.evaluation._evaluators._common import EvaluatorBase

class _AsyncBleuScoreEvaluator:
def __init__(self):
pass

async def __call__(self, *, response: str, ground_truth: str, **kwargs):
reference_tokens = nltk_tokenize(ground_truth)
hypothesis_tokens = nltk_tokenize(response)

# NIST Smoothing
smoothing_function = SmoothingFunction().method4
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

return {
"bleu_score": score,
}


class BleuScoreEvaluator:
class BleuScoreEvaluator(EvaluatorBase):
"""
Calculate the BLEU score for a given response and ground truth.
@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

def __init__(self):
self._async_evaluator = _AsyncBleuScoreEvaluator()
super().__init__()

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
"""Produce a glue score evaluation result.
:param eval_input: The input to the evaluation function.
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict
"""
ground_truth = eval_input["ground_truth"]
response = eval_input["response"]
reference_tokens = nltk_tokenize(ground_truth)
hypothesis_tokens = nltk_tokenize(response)

def __call__(self, *, response: str, ground_truth: str, **kwargs):
# NIST Smoothing
smoothing_function = SmoothingFunction().method4
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

return {
"bleu_score": score,
}

@overload # type: ignore
def __call__(self, *, response: str, ground_truth: str):
"""
Evaluate the BLEU score between the response and the ground truth.
@@ -64,9 +73,21 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
:return: The BLEU score.
:rtype: Dict[str, float]
"""
return async_run_allowing_running_loop(
self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
)

def _to_async(self):
return self._async_evaluator
@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""
Evaluate the BLEU score between the response and the ground truth.
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be compared against.
:paramtype ground_truth: str
:return: The BLEU score.
:rtype: Dict[str, float]
"""
return super().__call__(*args, **kwargs)
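
To make the refactor concrete, here is a standalone sketch of the BLEU computation that _do_eval performs, using nltk directly. word_tokenize stands in for the SDK's nltk_tokenize helper, and the example strings are invented, so treat it as an approximation rather than the evaluator's exact behavior.

# Requires nltk and its "punkt" tokenizer data (nltk.download("punkt")).
from nltk import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

ground_truth = "The capital of Japan is Tokyo."
response = "Tokyo is the capital of Japan."

reference_tokens = word_tokenize(ground_truth)
hypothesis_tokens = word_tokenize(response)

# NIST geometric-sequence smoothing (method4), as in the evaluator above.
smoothing_function = SmoothingFunction().method4
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
print({"bleu_score": score})

From the caller's perspective the evaluator is invoked as before, e.g. BleuScoreEvaluator()(response=response, ground_truth=ground_truth) still returns a dict with a "bleu_score" key, now routed through EvaluatorBase.__call__.
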
@@ -3,45 +3,44 @@
# ---------------------------------------------------------

from collections import Counter
from typing import List
from typing import List, Dict
from typing_extensions import overload, override

from promptflow._utils.async_utils import async_run_allowing_running_loop
from azure.ai.evaluation._evaluators._common import EvaluatorBase

from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

class F1ScoreEvaluator(EvaluatorBase):
"""
Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
class _AsyncF1ScoreEvaluator:
def __init__(self):
pass
F1 Scores range from 0 to 1, with 1 being the best possible score.
async def __call__(self, *, response: str, ground_truth: str, **kwargs):
"""
Evaluate F1 score.
The F1-score computes the ratio of the number of shared words between the model generation and
the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
precision is the ratio of the number of shared words to the total number of words in the generation, and recall
is the ratio of the number of shared words to the total number of words in the ground truth.
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be evaluated.
:paramtype ground_truth: str
:return: The F1 score.
:rtype: Dict[str, float]
"""
# Validate inputs
if not (response and response.strip() and response != "None") or not (
ground_truth and ground_truth.strip() and ground_truth != "None"
):
msg = "Both 'response' and 'ground_truth' must be non-empty strings."
raise EvaluationException(
message=msg,
internal_message=msg,
error_category=ErrorCategory.MISSING_FIELD,
error_blame=ErrorBlame.USER_ERROR,
error_target=ErrorTarget.F1_EVALUATOR,
)
Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
information in the response.
# Run f1 score computation.
f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
return {"f1_score": f1_result}
.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START f1_score_evaluator]
:end-before: [END f1_score_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call an F1ScoreEvaluator.
"""

id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

def __init__(self):
super().__init__()

@classmethod
def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +102,24 @@ def lower(text):

return f1

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
"""Produce an f1 score evaluation result.
class F1ScoreEvaluator:
"""
Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
F1 Scores range from 0 to 1, with 1 being the best possible score.
The F1-score computes the ratio of the number of shared words between the model generation and
the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
precision is the ratio of the number of shared words to the total number of words in the generation, and recall
is the ratio of the number of shared words to the total number of words in the ground truth.
Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
information in the response.
.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START f1_score_evaluator]
:end-before: [END f1_score_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call an F1ScoreEvaluator.
"""

id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
:param eval_input: The input to the evaluation function.
:type eval_input: Dict
:return: The evaluation result.
:rtype: Dict
"""
ground_truth = eval_input["ground_truth"]
response = eval_input["response"]
# Run f1 score computation.
f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

def __init__(self):
self._async_evaluator = _AsyncF1ScoreEvaluator()
return {"f1_score": f1_result}

def __call__(self, *, response: str, ground_truth: str, **kwargs):
@overload # type: ignore
def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
"""
Evaluate F1 score.
@@ -149,9 +131,20 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
:rtype: Dict[str, float]
"""

return async_run_allowing_running_loop(
self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
)
@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""
Evaluate F1 score.
def _to_async(self):
return self._async_evaluator
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be evaluated.
:paramtype ground_truth: str
:return: The F1 score.
:rtype: Dict[str, float]
"""
return super().__call__(*args, **kwargs)
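
Likewise, a hedged sketch of the word-overlap F1 described in the docstring above. It uses plain whitespace tokenization, whereas the SDK's _compute_f1_score also normalizes the text first (note the lower helper visible in the hunk header), so the scores will differ slightly.

from collections import Counter


def word_overlap_f1(response: str, ground_truth: str) -> float:
    """Harmonic mean of precision and recall over shared words (simplified sketch)."""
    response_tokens = response.split()
    truth_tokens = ground_truth.split()
    # Shared words, counted with multiplicity.
    shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())
    if shared == 0:
        return 0.0
    precision = shared / len(response_tokens)  # shared words / words in the generation
    recall = shared / len(truth_tokens)  # shared words / words in the ground truth
    return 2 * precision * recall / (precision + recall)


print(word_overlap_f1("Tokyo is the capital of Japan", "The capital of Japan is Tokyo"))
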
(Diffs for the remaining changed files were not loaded.)
