Skip to content

Commit

Permalink
Change eci prefix from upper case to lowercase (#3771)
Browse files Browse the repository at this point in the history
# Description

This PR changes the ECI metric key prefix from upper case (`ECI_`) to lower case (`eci_`)
to match the casing used by the other evaluator types and to integrate better with UI evaluation flows.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
  • Loading branch information
diondrapeck authored Sep 18, 2024
1 parent e79da2e commit 3cde352
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 27 deletions.
24 changes: 5 additions & 19 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,42 +220,28 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
metric_prefix = _get_metric_prefix(metric_name)
# Use label instead of score since these are assumed to be boolean results.
# Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

if metric_name == EvaluationMetrics.XPIA:
# Add "manipulated_content", "intrusion" and "information_gathering" to the result
# if present else set them to np.nan
result[metric_prefix + "_manipulated_content"] = (
result[metric_name + "_manipulated_content"] = (
parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
)
result[metric_prefix + "_intrusion"] = (
result[metric_name + "_intrusion"] = (
parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
)
result[metric_prefix + "_information_gathering"] = (
result[metric_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
)

return result
return _parse_content_harm_response(batch_response, metric_name)


def _get_metric_prefix(metric_name: str) -> str:
"""Get the prefix for the evaluation metric. This is usually the metric name.
:param metric_name: The evaluation metric to use.
:type metric_name: str
:return: The prefix for the evaluation metric.
:rtype: str
"""
if metric_name == _InternalEvaluationMetrics.ECI:
return "ECI"
return metric_name


def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
Expand Down
4 changes: 2 additions & 2 deletions src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ class ECIEvaluator:
.. code-block:: python
{
"ECI_label": "False",
"ECI_reason": "Some reason."
"eci_label": "False",
"eci_reason": "Some reason."
}
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,8 @@ def test_eci_evaluator(self, project_scope, azure_cred):
answer="Rhombus",
)
assert unrelated_result is not None
assert not unrelated_result["ECI_label"]
assert "geometry question" in unrelated_result["ECI_reason"]
assert not unrelated_result["eci_label"]
assert "geometry question" in unrelated_result["eci_reason"]

# @pytest.mark.skipif(
# not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
Expand Down
8 changes: 4 additions & 4 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,8 +463,8 @@ def test_content_safety_aggregation(self):

def test_label_based_aggregation(self):
data = {
"eci.ECI_label": [True, False, True, False, True],
"eci.ECI_reasoning": ["a", "b", "c", "d", "e"],
"eci.eci_label": [True, False, True, False, True],
"eci.eci_reasoning": ["a", "b", "c", "d", "e"],
"protected_material.protected_material_label": [False, False, False, False, True],
"protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
"unknown.unaccounted_label": [True, False, False, False, True],
Expand All @@ -478,11 +478,11 @@ def test_label_based_aggregation(self):
aggregation = _aggregate_metrics(data_df, evaluators)
# ECI and PM labels should be replaced with defect rates, unaccounted should not
assert len(aggregation) == 3
assert "eci.ECI_label" not in aggregation
assert "eci.eci_label" not in aggregation
assert "protected_material.protected_material_label" not in aggregation
assert aggregation["unknown.unaccounted_label"] == 0.4

assert aggregation["eci.ECI_defect_rate"] == 0.6
assert aggregation["eci.eci_defect_rate"] == 0.6
assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
assert "unaccounted_defect_rate" not in aggregation

Expand Down

0 comments on commit 3cde352

Please sign in to comment.