Commit

[pf-evals] Convert content safety and f1 score to async-enabled implementation (#3574)

# Description

Please add an informative description that covers the changes made by
the pull request and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Jul 25, 2024
1 parent fda05ab commit c1f0f6d
Showing 39 changed files with 131,553 additions and 12,820 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test-local.yml
@@ -53,6 +53,10 @@ jobs:
- name: generate end-to-end test config from secret
run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
working-directory: ${{ env.WORKING_DIRECTORY }}
- uses: azure/login@v2
with:
creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
enable-AzPSSession: true
- name: check azure is not installed
run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
working-directory: ${{ env.WORKING_DIRECTORY }}
53 changes: 0 additions & 53 deletions .pre-commit-config.yaml
@@ -44,56 +44,3 @@ repos:
# Use black profile for isort to avoid conflicts
# see https://github.com/PyCQA/isort/issues/1518
args: ["--profile", "black", --line-length=120]
- repo: local
hooks:
- id: pylint
name: pylint
entry: python
language: system
args: [ -m, pylint, --rcfile=pylintrc, --output-format=parseable ]
types: [python]
- repo: local
hooks:
- id: pylint-dependencies-check
name: pylint-dependencies-check
entry: python
language: system
types: [python]
args:
- "-c"
- |
import os
import sys
import pkg_resources
# These are the versions that run in our CI
dependencies = [
(
"azure-pylint-guidelines-checker",
"0.3.1",
[
"--index-url",
"https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/",
],
),
("pylint", "3.0.3", []),
]
# Make sure that correct versions are installed
for packagename, required_version, install_args in dependencies:
try:
version = pkg_resources.get_distribution(packagename).version
assert version == required_version
except AssertionError:
print(
f"Version mismatch: Installed version '{version}' of '{packagename}' does not match required version {required_version}"
)
except pkg_resources.DistributionNotFound:
print(f"Package '{packagename}' is not installed")
else:
continue
print(f"Please run the following command to install the correct version of {packagename}")
print(f"\tpython -m pip install {packagename}=={required_version} {' '.join(install_args)}")
sys.exit(1)
@@ -18,7 +18,6 @@
from _sexual import SexualEvaluator
from _violence import ViolenceEvaluator


logger = logging.getLogger(__name__)


@@ -1,16 +1,31 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncHateUnfairnessEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
# Hate_fairness is the actual backend metric name, which reads poorly,
# so the evaluator exposes the inverted name "hate unfairness" instead.
super().__init__(
metric=EvaluationMetrics.HATE_FAIRNESS,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class HateUnfairnessEvaluator:
"""
Initialize a hate-unfairness evaluator for hate unfairness score.
@@ -44,10 +59,20 @@ class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None) -> None:
# Hate_fairness is the actual backend metric name. Which, uh, doesn't sound great.
# so invert the name.
super().__init__(
metric=EvaluationMetrics.HATE_FAIRNESS,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncHateUnfairnessEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates hate unfairness content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The hate unfairness score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
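
For orientation, here is a minimal usage sketch of an evaluator after this conversion: the public class keeps its synchronous `__call__`, which now dispatches to the internal async class through `async_run_allowing_running_loop`, while the private `_to_async()` hook returns the awaitable variant. The import path and the `project_scope` keys below are assumptions based on the promptflow-evals package layout, and all values are placeholders; this is not code from the PR.

```python
import asyncio

# Import path assumed from the promptflow-evals package layout; adjust if needed.
from promptflow.evals.evaluators import HateUnfairnessEvaluator

# Placeholder Azure AI project scope; substitute real values.
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = HateUnfairnessEvaluator(project_scope)

# Synchronous call: internally dispatched to _AsyncHateUnfairnessEvaluator
# via async_run_allowing_running_loop, so existing callers keep working.
print(evaluator(question="What is the capital of France?", answer="Paris."))


# Awaitable variant, obtained through the private _to_async() hook.
async def main():
    async_evaluator = evaluator._to_async()
    return await async_evaluator(question="What is the capital of France?", answer="Paris.")


print(asyncio.run(main()))
```
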
@@ -1,15 +1,29 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class SelfHarmEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncSelfHarmEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SELF_HARM,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class SelfHarmEvaluator:
"""
Initialize a self harm evaluator for self harm score.
@@ -43,8 +57,20 @@ class SelfHarmEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SELF_HARM,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncSelfHarmEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates self harm content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The self harm score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
@@ -1,15 +1,29 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class SexualEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncSexualEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SEXUAL,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class SexualEvaluator:
"""
Initialize a sexual evaluator for sexual score.
@@ -43,8 +57,20 @@ class SexualEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.SEXUAL,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncSexualEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates sexual content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The sexual score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
@@ -1,15 +1,29 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop

try:
from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from .common import ContentSafetyEvaluatorBase
from .common.constants import EvaluationMetrics
except ImportError:
from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase
from common import ContentSafetyEvaluatorBase
from common.constants import EvaluationMetrics


class ViolenceEvaluator(ContentSafetySubEvaluatorBase):
class _AsyncViolenceEvaluator(ContentSafetyEvaluatorBase):
def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.VIOLENCE,
project_scope=project_scope,
credential=credential,
)

async def __call__(self, *, question: str, answer: str, **kwargs):
return await super().__call__(question=question, answer=answer, **kwargs)


class ViolenceEvaluator:
"""
Initialize a violence evaluator for violence score.
@@ -43,8 +57,20 @@ class ViolenceEvaluator(ContentSafetySubEvaluatorBase):
"""

def __init__(self, project_scope: dict, credential=None):
super().__init__(
metric=EvaluationMetrics.VIOLENCE,
project_scope=project_scope,
credential=credential,
)
self._async_evaluator = _AsyncViolenceEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates violence content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The violence score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
@@ -2,11 +2,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from . import constants, evaluate_with_rai_service, validate_inputs, utils
from .content_safety_base import ContentSafetyEvaluatorBase

__all__ = [
"constants",
"evaluate_with_rai_service",
"validate_inputs",
"utils",
]
__all__ = ["ContentSafetyEvaluatorBase"]
@@ -1,19 +1,18 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from abc import ABC

try:
from .common.constants import EvaluationMetrics
from .common.evaluate_with_rai_service import evaluate_with_rai_service
from .common.validate_inputs import validate_inputs
from .constants import EvaluationMetrics
from .evaluate_with_rai_service import evaluate_with_rai_service
except ImportError:
from common.constants import EvaluationMetrics
from common.evaluate_with_rai_service import evaluate_with_rai_service
from common.validate_inputs import validate_inputs
from constants import EvaluationMetrics
from evaluate_with_rai_service import evaluate_with_rai_service


class ContentSafetySubEvaluatorBase(ABC):
class ContentSafetyEvaluatorBase(ABC):
"""
Initialize an evaluator for a specified Evaluation Metric. Base class that is not
meant to be instantiated by users.
@@ -33,7 +32,7 @@ def __init__(self, metric: EvaluationMetrics, project_scope: dict, credential=No
self._project_scope = project_scope
self._credential = credential

def __call__(self, *, question: str, answer: str, **kwargs):
async def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.
@@ -46,10 +45,13 @@ def __call__(self, *, question: str, answer: str, **kwargs):
"""
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
_ = validate_inputs(question=question, answer=answer)
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = evaluate_with_rai_service(
result = await evaluate_with_rai_service(
metric_name=self._metric,
question=question,
answer=answer,
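
Every sub-evaluator touched by this PR follows the same shape: an internal async class does the work, and the public class stays synchronous by delegating through `async_run_allowing_running_loop`. The following is a stripped-down sketch of that pattern with a toy scoring coroutine standing in for the RAI service call; it is illustrative only and not promptflow's actual implementation.

```python
from promptflow._utils.async_utils import async_run_allowing_running_loop


class _AsyncToyEvaluator:
    """Async implementation; a stand-in for the real RAI-service-backed evaluator."""

    def __init__(self, metric: str):
        self._metric = metric

    async def __call__(self, *, question: str, answer: str, **kwargs):
        # Inline input validation, mirroring the checks added to ContentSafetyEvaluatorBase.
        if not (question and question.strip() and question != "None") or not (
            answer and answer.strip() and answer != "None"
        ):
            raise ValueError("Both 'question' and 'answer' must be non-empty strings.")
        # A real evaluator would `await evaluate_with_rai_service(...)` here.
        return {self._metric: "Very low", f"{self._metric}_score": 0}


class ToyEvaluator:
    """Public sync facade; callers keep the same calling convention as before."""

    def __init__(self, metric: str):
        self._async_evaluator = _AsyncToyEvaluator(metric)

    def __call__(self, *, question: str, answer: str, **kwargs):
        # Runs the async evaluator whether or not an event loop is already running.
        return async_run_allowing_running_loop(
            self._async_evaluator, question=question, answer=answer, **kwargs
        )

    def _to_async(self):
        return self._async_evaluator


if __name__ == "__main__":
    print(ToyEvaluator("toy_metric")(question="Is this safe?", answer="Yes."))
```

This split keeps the public API unchanged while letting the evaluation pipeline await the inner evaluator directly when it needs concurrency.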
