Flake8 error fixed
singankit committed Apr 1, 2024
1 parent 5919f7b commit 001b91d
Showing 32 changed files with 138 additions and 98 deletions.
1 change: 0 additions & 1 deletion src/promptflow-evals/promptflow/evals/__init__.py
@@ -9,4 +9,3 @@
__all__ = [
"evaluate"
]

2 changes: 1 addition & 1 deletion src/promptflow-evals/promptflow/evals/_constants.py
@@ -9,4 +9,4 @@ class EvaluationMetrics:
HATE_FAIRNESS = "hate_fairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
SEXUAL = "sexual"
9 changes: 1 addition & 8 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -1,15 +1,8 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json
from json import JSONDecodeError
from pathlib import Path
from typing import Optional, Dict, Union, Callable

import numpy as np
from typing import Optional, Dict, Callable
import pandas as pd

from ._utils import load_jsonl
from ._flow_run_wrapper import FlowRunWrapper
from promptflow import PFClient

@@ -19,7 +19,10 @@ def get_result_df(self, all_results=True, exclude_inputs=False):
result_df = result_df.drop(
columns=[col for col in result_df.columns if col.startswith("inputs.")]
)
result_df.rename(columns={col: col.replace("outputs", self.prefix) for col in [col for col in result_df.columns if col.startswith("outputs.")]}, inplace=True)
result_df.rename(
columns={col: col.replace("outputs", self.prefix)
for col in [col for col in result_df.columns if col.startswith("outputs.")]},
inplace=True)
return result_df

def _wait_for_completion(self):
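As a side note, the column handling in get_result_df above can be illustrated with a small standalone pandas sketch: inputs.* columns are dropped and outputs.* columns are renamed with the wrapper's prefix. The column names and the prefix value below are placeholders, not taken from this diff.

import pandas as pd

# Hypothetical run output; column names are illustrative only.
result_df = pd.DataFrame({
    "inputs.question": ["What is the value of 2 + 2?"],
    "outputs.score": [5],
})

prefix = "relevance"  # stands in for self.prefix

# Drop input columns, as in the branch above that excludes inputs.
result_df = result_df.drop(
    columns=[col for col in result_df.columns if col.startswith("inputs.")]
)

# Rename outputs.* columns to <prefix>.*, mirroring the rename call above.
result_df.rename(
    columns={col: col.replace("outputs", prefix)
             for col in result_df.columns if col.startswith("outputs.")},
    inplace=True)

print(list(result_df.columns))  # ['relevance.score']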
43 changes: 29 additions & 14 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
@@ -12,7 +12,6 @@
import logging
import numpy as np


logger = logging.getLogger(__name__)


@@ -30,9 +29,11 @@ def __init__(
:type model_config: AzureOpenAIConnection
:param deployment_name: Deployment to be used which has Azure OpenAI model.
:type deployment_name: AzureOpenAIConnection
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution. Default is True.
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
@@ -45,15 +46,19 @@ def __init__(
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}}
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
}
]
result = chat_eval(conversation=conversation)
"""
self._eval_last_turn = eval_last_turn
self._parallel = parallel

# TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
self._rag_evaluators = [
self._rag_evaluators = [
GroundednessEvaluator(model_config, deployment_name=deployment_name),
RelevanceEvaluator(model_config, deployment_name=deployment_name),
]
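For readers looking at the eval_last_turn and parallel flags documented above, a hedged construction sketch follows. The ChatEvaluator class name, the import paths, and the AzureOpenAIConnection constructor arguments are assumptions based on the surrounding package, not details shown in this hunk.

from promptflow.connections import AzureOpenAIConnection  # assumed import path
from promptflow.evals.evaluators.chat import ChatEvaluator  # assumed class name/export

# Placeholder connection; parameter names are assumed.
model_config = AzureOpenAIConnection(
    api_base="https://<your-endpoint>.openai.azure.com/",
    api_key="<api-key>",
)

# Evaluate only the latest user/assistant exchange, and run evaluators sequentially.
chat_eval = ChatEvaluator(
    model_config=model_config,
    deployment_name="<deployment-name>",
    eval_last_turn=True,
    parallel=False,
)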
@@ -66,7 +71,7 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
"""Evaluates chat scenario.
:param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
"context" key is optional for assistant's turn and should have "citations" key with list of citations.
"context" key is optional for assistant's turn and should have "citations" key with list of citations.
:type conversation: List[Dict]
:return: The scores for Chat scenario.
:rtype: dict
@@ -99,7 +104,7 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
compute_rag_based_metrics = True
if len(answers) != len(contexts):
safe_message = "Skipping rag based metrics as we need citations or " \
"retrieved_documents in context key of every assistant's turn"
"retrieved_documents in context key of every assistant's turn"
logger.warning(safe_message)
compute_rag_based_metrics = False

Expand All @@ -117,7 +122,8 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
# Parallel execution
with ThreadPoolExecutor() as executor:
future_to_evaluator = {
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator): evaluator
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator)
: evaluator
for evaluator in selected_evaluators
}

@@ -160,7 +166,8 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):

return score
except Exception as e:
logger.warning(f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
logger.warning(
f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
return {}

def _validate_conversation(self, conversation: List[Dict]):
@@ -175,27 +182,35 @@ def _validate_conversation(self, conversation: List[Dict]):
raise ValueError(f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}")

if "role" not in turn or "content" not in turn:
raise ValueError(f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}")
raise ValueError(
f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: "
f"{one_based_turn_num}")

if turn["role"] != expected_role:
raise ValueError(f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")
raise ValueError(
f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")

if not isinstance(turn["content"], str):
raise ValueError(f"Content in each turn must be a string. Turn number: {one_based_turn_num}")

if turn["role"] == "assistant" and "context" in turn:
if not isinstance(turn["context"], dict):
raise ValueError(f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")
raise ValueError(
f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")

if "citations" not in turn["context"]:
raise ValueError(f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}")
raise ValueError(
f"Context in each assistant's turn must have 'citations' key. Turn number:"
f" {one_based_turn_num}")

if not isinstance(turn["context"]["citations"], list):
raise ValueError(f"'citations' in context must be a list. Turn number: {one_based_turn_num}")

for citation_num, citation in enumerate(turn["context"]["citations"]):
if not isinstance(citation, dict):
raise ValueError(f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}")
raise ValueError(
f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num},"
f" Citation number: {citation_num + 1}")

# Toggle expected role for the next turn
expected_role = "user" if expected_role == "assistant" else "assistant"
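To make the checks above easier to scan, here is a compact standalone sketch of the role-alternation rule they enforce (an illustrative distillation, not the class's _validate_conversation method itself):

from typing import Dict, List


def roles_alternate(conversation: List[Dict]) -> bool:
    """Return True when turns alternate user/assistant, starting with a user turn."""
    expected_role = "user"
    for turn in conversation:
        if turn.get("role") != expected_role:
            return False
        expected_role = "assistant" if expected_role == "user" else "user"
    return True


# Passes: alternating roles, and each turn has "role" and "content" keys.
ok = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello"}]
# Fails: the conversation starts with an assistant turn.
bad = [{"role": "assistant", "content": "Hello"}]
print(roles_alternate(ok), roles_alternate(bad))  # True False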
@@ -58,4 +58,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
"""

# Run the evaluation flow
return self._flow(question=question, answer=answer)
return self._flow(question=question, answer=answer)
@@ -1,9 +1,10 @@
from promptflow import tool


@tool
def validate_inputs(question: str, answer: str):
# Validate input parameters
if not (question and question.strip()) or not (answer and answer.strip()):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

return True
return True
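An illustrative use of the validate_inputs tool above (the calls are examples, not part of the commit): it returns True for non-empty strings and raises ValueError when either value is empty or whitespace-only.

validate_inputs(question="What is the value of 2 + 2?", answer="2 + 2 = 4")  # True

try:
    validate_inputs(question="   ", answer="2 + 2 = 4")
except ValueError as err:
    print(err)  # Both 'question' and 'answer' must be non-empty strings.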
@@ -4,13 +4,11 @@

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore


from .violence import ViolenceEvaluator
from .sexual import SexualEvaluator
from.self_harm import SelfHarmEvaluator
from .self_harm import SelfHarmEvaluator
from .hate_unfairness import HateUnfairnessEvaluator


__all__ = [
"ViolenceEvaluator",
"SexualEvaluator",
@@ -1,17 +1,20 @@
from enum import Enum


class RAIService:
"""Define constants related to RAI service"""
TIMEOUT = 1800
SLEEP_TIME = 2
HARM_SEVERITY_THRESHOLD = 4


class HarmSeverityLevel(Enum):
Safe = 0
Low = 1
Medium = 2
High = 3


class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""
CONTENT_HARM = "content harm"
CONTENT_HARM = "content harm"
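A worked sketch of what these constants imply for polling, given the fetch_result loop later in this diff where sleep_time = RAIService.SLEEP_TIME ** request_count: the wait grows exponentially and polling stops once the TIMEOUT budget is spent. The loop below reproduces only the timing arithmetic, not the actual loop condition or service call.

TIMEOUT = 1800   # seconds, mirroring RAIService.TIMEOUT
SLEEP_TIME = 2   # base, mirroring RAIService.SLEEP_TIME

elapsed, request_count = 0, 0
while elapsed <= TIMEOUT:
    request_count += 1
    sleep_time = SLEEP_TIME ** request_count  # 2, 4, 8, 16, ...
    elapsed += sleep_time
    print(f"attempt {request_count}: sleep {sleep_time}s, cumulative {elapsed}s")
# About ten attempts exhaust the 1800-second budget (2 + 4 + ... + 1024 = 2046).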
@@ -9,12 +9,14 @@
from azure.identity import DefaultAzureCredential
import requests


def ensure_service_availability(rai_svc_url: str):
svc_liveness_url = rai_svc_url.split("/subscriptions")[0] + "/meta/version"
response = requests.get(svc_liveness_url)
if response.status_code != 200:
raise Exception("RAI service is not available in this region")


def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, credential: TokenCredential):
user_text = f"<Human>{question}</><System>{answer}</>"
normalized_user_text = user_text.replace("'", "\\\"")
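For illustration, the text built at the top of submit_request looks like this for a sample question/answer pair (sample values only; the escaping mirrors the replace call shown above, and the normalized text presumably feeds the UserTextList payload field used below):

question = "What is France's capital?"
answer = "It's Paris."

user_text = f"<Human>{question}</><System>{answer}</>"
normalized_user_text = user_text.replace("'", "\\\"")  # single quotes become \"
print(normalized_user_text)
# <Human>What is France\"s capital?</><System>It\"s Paris.</>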
@@ -33,13 +35,14 @@ def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, cr

response = requests.post(url, json=payload, headers=headers)
if response.status_code != 202:
print("Fail evaluating '%s' with error message: %s" %(payload["UserTextList"], response.text))
print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
response.raise_for_status()

result = response.json()
operation_id = result["location"].split("/")[-1]
return operation_id


def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential):
start = time.time()
request_count = 0
@@ -64,6 +67,7 @@ def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredentia
sleep_time = RAIService.SLEEP_TIME ** request_count
time.sleep(sleep_time)


def parse_response(batch_response: List[dict], metric_name: str) -> List[List[dict]]:
# Fix the metric name if it's "hate_fairness"
# Eventually we will remove this fix once the RAI service is updated
@@ -92,11 +96,11 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di
harm_response = eval(response[metric_name])
else:
harm_response = ""
except:
except Exception:
harm_response = response[metric_name]

if harm_response != "" and isinstance(harm_response, dict):
### check if "output" is one key in harm_response
# check if "output" is one key in harm_response
if "output" in harm_response:
harm_response = harm_response["output"]

@@ -139,6 +143,7 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di

return result


def get_rai_svc_url(project_scope: dict, credential: TokenCredential):
from azure.ai.ml import MLClient
ml_client = MLClient(
@@ -162,14 +167,14 @@ def get_rai_svc_url(project_scope: dict, credential: TokenCredential):

return rai_url


@tool
def evaluate_with_rai_service(
question: str,
answer: str,
metric_name: str,
project_scope: dict,
credential: TokenCredential):

question: str,
answer: str,
metric_name: str,
project_scope: dict,
credential: TokenCredential):
# Use DefaultAzureCredential if no credential is provided
# This is for the batch run scenario, as the credential cannot be serialized by promptflow
if credential is None or credential == {}:
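A hedged sketch of calling the evaluate_with_rai_service tool defined above. The project_scope key names and the metric value are assumptions (they are not spelled out in this hunk); passing an explicit DefaultAzureCredential matches the fallback described in the comment above.

from azure.identity import DefaultAzureCredential

# Assumed key names for the Azure AI project scope (subscription, resource group, project).
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

result = evaluate_with_rai_service(
    question="What is the value of 2 + 2?",
    answer="2 + 2 = 4",
    metric_name="violence",  # one of the content-harm metrics defined in _constants.py
    project_scope=project_scope,
    credential=DefaultAzureCredential(),
)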
@@ -1,6 +1,5 @@
import constants
import numpy as np
import re


def get_harm_severity_level(harm_score: int) -> str:
@@ -9,7 +8,7 @@ def get_harm_severity_level(harm_score: int) -> str:
constants.HarmSeverityLevel.Medium: [4, 5],
constants.HarmSeverityLevel.High: [6, 7]
}
if harm_score == np.nan or harm_score == None:
if harm_score == np.nan or harm_score is None:
return np.nan
for harm_level, harm_score_range in HAMR_SEVERITY_LEVEL_MAPPING.items():
if harm_score >= harm_score_range[0] and harm_score <= harm_score_range[1]:
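A worked sketch of the range lookup performed by get_harm_severity_level above. Only the Medium and High ranges are visible in this hunk; the Safe and Low ranges below are assumed by symmetry and should be checked against the hidden lines.

# Score ranges per severity level (Safe/Low assumed; Medium/High from the hunk above).
severity_ranges = {
    "Safe": (0, 1),    # assumed
    "Low": (2, 3),     # assumed
    "Medium": (4, 5),  # constants.HarmSeverityLevel.Medium: [4, 5]
    "High": (6, 7),    # constants.HarmSeverityLevel.High: [6, 7]
}


def severity_for(score):
    """Illustrative version of the range check in get_harm_severity_level."""
    for level, (low, high) in severity_ranges.items():
        if low <= score <= high:
            return level
    return float("nan")  # out-of-range scores, mirroring the np.nan fallback


print(severity_for(2), severity_for(5), severity_for(7))  # Low Medium High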
@@ -1,9 +1,10 @@
from promptflow import tool


@tool
def validate_inputs(question: str, answer: str):
# Validate input parameters
if not (question and question.strip()) or not (answer and answer.strip()):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

return True
return True
@@ -10,7 +10,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] =
"""
Initialize an evaluator for hate unfairness score.
:param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
@@ -50,4 +51,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
project_scope=self._project_scope,
credential=self._credential)

return output["result"]
return output["result"]
@@ -10,7 +10,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] =
"""
Initialize an evaluator for self harm score.
:param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
@@ -50,4 +51,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
project_scope=self._project_scope,
credential=self._credential)

return output["result"]
return output["result"]
@@ -10,7 +10,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] =
"""
Initialize an evaluator for sexual score.
:param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
@@ -50,4 +51,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
project_scope=self._project_scope,
credential=self._credential)

return output["result"]
return output["result"]