Commit

[pf-evals] Convert content safety and f1 score to async-enabled implementation (#3574)

# Description

Please add an informative description that covers the changes made by
the pull request and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Jul 25, 2024
1 parent fda05ab commit c1f0f6d
Showing 39 changed files with 131,553 additions and 12,820 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test-local.yml
@@ -53,6 +53,10 @@ jobs:
- name: generate end-to-end test config from secret
run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
working-directory: ${{ env.WORKING_DIRECTORY }}
- uses: azure/login@v2
with:
creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
enable-AzPSSession: true
- name: check azure is not installed
run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
working-directory: ${{ env.WORKING_DIRECTORY }}
53 changes: 0 additions & 53 deletions .pre-commit-config.yaml
@@ -44,56 +44,3 @@ repos:
# Use black profile for isort to avoid conflicts
# see https://github.com/PyCQA/isort/issues/1518
args: ["--profile", "black", --line-length=120]
- repo: local
hooks:
- id: pylint
name: pylint
entry: python
language: system
args: [ -m, pylint, --rcfile=pylintrc, --output-format=parseable ]
types: [python]
- repo: local
hooks:
- id: pylint-dependencies-check
name: pylint-dependencies-check
entry: python
language: system
types: [python]
args:
- "-c"
- |
import os
import sys
import pkg_resources
# These are the versions that run in our CI
dependencies = [
(
"azure-pylint-guidelines-checker",
"0.3.1",
[
"--index-url",
"https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/",
],
),
("pylint", "3.0.3", []),
]
# Make sure that correct versions are installed
for packagename, required_version, install_args in dependencies:
try:
version = pkg_resources.get_distribution(packagename).version
assert version == required_version
except AssertionError:
print(
f"Version mismatch: Installed version '{version}' of '{packagename}' does not match required version {required_version}"
)
except pkg_resources.DistributionNotFound:
print(f"Package '{packagename}' is not installed")
else:
continue
print(f"Please run the following command to install the correct version of {packagename}")
print(f"\tpython -m pip install {packagename}=={required_version} {' '.join(install_args)}")
sys.exit(1)
@@ -18,7 +18,6 @@
from _sexual import SexualEvaluator
from _violence import ViolenceEvaluator


logger = logging.getLogger(__name__)


@@ -1,16 +1,31 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncHateUnfairnessEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
# Hate_fairness is the actual backend metric name, which reads poorly,
# so the evaluator exposes the inverted name "hate unfairness" instead.
super().__init__(
metric=EvaluationMetrics.HATE_FAIRNESS,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class HateUnfairnessEvaluator:
"""
Initialize a hate-unfairness evaluator for hate unfairness score.
@@ -44,10 +59,20 @@ class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None) -> None:
# Hate_fairness is the actual backend metric name. Which, uh, doesn't sound great.
# so invert the name.
super().__init__(
metric=EvaluationMetrics.HATE_FAIRNESS,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncHateUnfairnessEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates hate unfairness content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The hate unfairness score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
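
For orientation, here is a minimal usage sketch of an evaluator after this conversion: the public class keeps its synchronous `__call__`, which now dispatches to the internal async class through `async_run_allowing_running_loop`, while the private `_to_async()` hook returns the awaitable variant. The import path and the `project_scope` keys below are assumptions based on the promptflow-evals package layout, and all values are placeholders; this is not code from the PR.

```python
import asyncio

# Import path assumed from the promptflow-evals package layout; adjust if needed.
from promptflow.evals.evaluators import HateUnfairnessEvaluator

# Placeholder Azure AI project scope; substitute real values.
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = HateUnfairnessEvaluator(project_scope)

# Synchronous call: internally dispatched to _AsyncHateUnfairnessEvaluator
# via async_run_allowing_running_loop, so existing callers keep working.
print(evaluator(question="What is the capital of France?", answer="Paris."))


# Awaitable variant, obtained through the private _to_async() hook.
async def main():
    async_evaluator = evaluator._to_async()
    return await async_evaluator(question="What is the capital of France?", answer="Paris.")


print(asyncio.run(main()))
```
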
@@ -1,15 +1,29 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class SelfHarmEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncSelfHarmEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SELF_HARM,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class SelfHarmEvaluator:
"""
Initialize a self harm evaluator for self harm score.
@@ -43,8 +57,20 @@ class SelfHarmEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SELF_HARM,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncSelfHarmEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates self harm content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The self harm score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
@@ -1,15 +1,29 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class SexualEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncSexualEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SEXUAL,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class SexualEvaluator:
"""
Initialize a sexual evaluator for sexual score.
@@ -43,8 +57,20 @@ class SexualEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SEXUAL,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncSexualEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates sexual content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The sexual score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
@@ -1,15 +1,29 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class ViolenceEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncViolenceEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.VIOLENCE,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class ViolenceEvaluator:
"""
Initialize a violence evaluator for violence score.
@@ -43,8 +57,20 @@ class ViolenceEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.VIOLENCE,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncViolenceEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates violence content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The violence score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
@@ -2,11 +2,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from . import constants, evaluate_with_rai_service, validate_inputs, utils
from .content_safety_base import ContentSafetyEvaluatorBase

__all__ = [
"constants",
"evaluate_with_rai_service",
"validate_inputs",
"utils",
]
__all__ = ["ContentSafetyEvaluatorBase"]
@@ -1,19 +1,18 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from abc import ABC

try:
from .common.constants import EvaluationMetrics
from .common.evaluate_with_rai_service import evaluate_with_rai_service
from .common.validate_inputs import validate_inputs
from .constants import EvaluationMetrics
from .evaluate_with_rai_service import evaluate_with_rai_service
except ImportError:
from common.constants import EvaluationMetrics
from common.evaluate_with_rai_service import evaluate_with_rai_service
from common.validate_inputs import validate_inputs
from constants import EvaluationMetrics
from evaluate_with_rai_service import evaluate_with_rai_service


class ContentSafetySubEvaluatorBase(ABC):
class ContentSafetyEvaluatorBase(ABC):
"""
Initialize an evaluator for a specified Evaluation Metric. Base class that is not
meant to be instantiated by users.
@@ -33,7 +32,7 @@ def __init__(self, metric: EvaluationMetrics, project_scope: dict, credential=No
self._project_scope = project_scope
self._credential = credential

def __call__(self, *, question: str, answer: str, **kwargs):
async def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.
@@ -46,10 +45,13 @@ def __call__(self, *, question: str, answer: str, **kwargs):
"""
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
_ = validate_inputs(question=question, answer=answer)
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = evaluate_with_rai_service(
result = await evaluate_with_rai_service(
metric_name=self._metric,
question=question,
answer=answer,
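
Every sub-evaluator touched by this PR follows the same shape: an internal async class does the work, and the public class stays synchronous by delegating through `async_run_allowing_running_loop`. The following is a stripped-down sketch of that pattern with a toy scoring coroutine standing in for the RAI service call; it is illustrative only and not promptflow's actual implementation.

```python
from promptflow._utils.async_utils import async_run_allowing_running_loop


class _AsyncToyEvaluator:
    """Async implementation; a stand-in for the real RAI-service-backed evaluator."""

    def __init__(self, metric: str):
        self._metric = metric

    async def __call__(self, *, question: str, answer: str, **kwargs):
        # Inline input validation, mirroring the checks added to ContentSafetyEvaluatorBase.
        if not (question and question.strip() and question != "None") or not (
            answer and answer.strip() and answer != "None"
        ):
            raise ValueError("Both 'question' and 'answer' must be non-empty strings.")
        # A real evaluator would `await evaluate_with_rai_service(...)` here.
        return {self._metric: "Very low", f"{self._metric}_score": 0}


class ToyEvaluator:
    """Public sync facade; callers keep the same calling convention as before."""

    def __init__(self, metric: str):
        self._async_evaluator = _AsyncToyEvaluator(metric)

    def __call__(self, *, question: str, answer: str, **kwargs):
        # Runs the async evaluator whether or not an event loop is already running.
        return async_run_allowing_running_loop(
            self._async_evaluator, question=question, answer=answer, **kwargs
        )

    def _to_async(self):
        return self._async_evaluator


if __name__ == "__main__":
    print(ToyEvaluator("toy_metric")(question="Is this safe?", answer="Yes."))
```

This split keeps the public API unchanged while letting the evaluation pipeline await the inner evaluator directly when it needs concurrency.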
