Flake8 error fixed
singankit committed Apr 1, 2024
1 parent 5919f7b commit 001b91d
Showing 32 changed files with 138 additions and 98 deletions.
1 change: 0 additions & 1 deletion src/promptflow-evals/promptflow/evals/__init__.py
@@ -9,4 +9,3 @@
__all__ = [
"evaluate"
]

2 changes: 1 addition & 1 deletion src/promptflow-evals/promptflow/evals/_constants.py
@@ -9,4 +9,4 @@ class EvaluationMetrics:
HATE_FAIRNESS = "hate_fairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
SEXUAL = "sexual"
9 changes: 1 addition & 8 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -1,15 +1,8 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json
from json import JSONDecodeError
from pathlib import Path
from typing import Optional, Dict, Union, Callable

import numpy as np
from typing import Optional, Dict, Callable
import pandas as pd

from ._utils import load_jsonl
from ._flow_run_wrapper import FlowRunWrapper
from promptflow import PFClient

@@ -19,7 +19,10 @@ def get_result_df(self, all_results=True, exclude_inputs=False):
result_df = result_df.drop(
columns=[col for col in result_df.columns if col.startswith("inputs.")]
)
result_df.rename(columns={col: col.replace("outputs", self.prefix) for col in [col for col in result_df.columns if col.startswith("outputs.")]}, inplace=True)
result_df.rename(
columns={col: col.replace("outputs", self.prefix)
for col in [col for col in result_df.columns if col.startswith("outputs.")]},
inplace=True)
return result_df

def _wait_for_completion(self):
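As a side note, the column handling in get_result_df above can be illustrated with a small standalone pandas sketch: inputs.* columns are dropped and outputs.* columns are renamed with the wrapper's prefix. The column names and the prefix value below are placeholders, not taken from this diff.

import pandas as pd

# Hypothetical run output; column names are illustrative only.
result_df = pd.DataFrame({
    "inputs.question": ["What is the value of 2 + 2?"],
    "outputs.score": [5],
})

prefix = "relevance"  # stands in for self.prefix

# Drop input columns, as in the branch above that excludes inputs.
result_df = result_df.drop(
    columns=[col for col in result_df.columns if col.startswith("inputs.")]
)

# Rename outputs.* columns to <prefix>.*, mirroring the rename call above.
result_df.rename(
    columns={col: col.replace("outputs", prefix)
             for col in result_df.columns if col.startswith("outputs.")},
    inplace=True)

print(list(result_df.columns))  # ['relevance.score']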
43 changes: 29 additions & 14 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
@@ -12,7 +12,6 @@
import logging
import numpy as np


logger = logging.getLogger(__name__)


@@ -30,9 +29,11 @@ def __init__(
:type model_config: AzureOpenAIConnection
:param deployment_name: Deployment to be used which has Azure OpenAI model.
:type deployment_name: AzureOpenAIConnection
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution. Default is True.
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
@@ -45,15 +46,19 @@ def __init__(
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}}
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
}
]
result = chat_eval(conversation=conversation)
"""
self._eval_last_turn = eval_last_turn
self._parallel = parallel

# TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
self._rag_evaluators = [
self._rag_evaluators = [
GroundednessEvaluator(model_config, deployment_name=deployment_name),
RelevanceEvaluator(model_config, deployment_name=deployment_name),
]
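For readers looking at the eval_last_turn and parallel flags documented above, a hedged construction sketch follows. The ChatEvaluator class name, the import paths, and the AzureOpenAIConnection constructor arguments are assumptions based on the surrounding package, not details shown in this hunk.

from promptflow.connections import AzureOpenAIConnection  # assumed import path
from promptflow.evals.evaluators.chat import ChatEvaluator  # assumed class name/export

# Placeholder connection; parameter names are assumed.
model_config = AzureOpenAIConnection(
    api_base="https://<your-endpoint>.openai.azure.com/",
    api_key="<api-key>",
)

# Evaluate only the latest user/assistant exchange, and run evaluators sequentially.
chat_eval = ChatEvaluator(
    model_config=model_config,
    deployment_name="<deployment-name>",
    eval_last_turn=True,
    parallel=False,
)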
@@ -66,7 +71,7 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
"""Evaluates chat scenario.
:param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
"context" key is optional for assistant's turn and should have "citations" key with list of citations.
"context" key is optional for assistant's turn and should have "citations" key with list of citations.
:type conversation: List[Dict]
:return: The scores for Chat scenario.
:rtype: dict
@@ -99,7 +104,7 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
compute_rag_based_metrics = True
if len(answers) != len(contexts):
safe_message = "Skipping rag based metrics as we need citations or " \
"retrieved_documents in context key of every assistant's turn"
"retrieved_documents in context key of every assistant's turn"
logger.warning(safe_message)
compute_rag_based_metrics = False

Expand All @@ -117,7 +122,8 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
# Parallel execution
with ThreadPoolExecutor() as executor:
future_to_evaluator = {
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator): evaluator
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator)
: evaluator
for evaluator in selected_evaluators
}

@@ -160,7 +166,8 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):

return score
except Exception as e:
logger.warning(f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
logger.warning(
f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
return {}

def _validate_conversation(self, conversation: List[Dict]):
@@ -175,27 +182,35 @@ def _validate_conversation(self, conversation: List[Dict]):
raise ValueError(f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}")

if "role" not in turn or "content" not in turn:
raise ValueError(f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}")
raise ValueError(
f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: "
f"{one_based_turn_num}")

if turn["role"] != expected_role:
raise ValueError(f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")
raise ValueError(
f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")

if not isinstance(turn["content"], str):
raise ValueError(f"Content in each turn must be a string. Turn number: {one_based_turn_num}")

if turn["role"] == "assistant" and "context" in turn:
if not isinstance(turn["context"], dict):
raise ValueError(f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")
raise ValueError(
f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")

if "citations" not in turn["context"]:
raise ValueError(f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}")
raise ValueError(
f"Context in each assistant's turn must have 'citations' key. Turn number:"
f" {one_based_turn_num}")

if not isinstance(turn["context"]["citations"], list):
raise ValueError(f"'citations' in context must be a list. Turn number: {one_based_turn_num}")

for citation_num, citation in enumerate(turn["context"]["citations"]):
if not isinstance(citation, dict):
raise ValueError(f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}")
raise ValueError(
f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num},"
f" Citation number: {citation_num + 1}")

# Toggle expected role for the next turn
expected_role = "user" if expected_role == "assistant" else "assistant"
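To make the checks above easier to scan, here is a compact standalone sketch of the role-alternation rule they enforce (an illustrative distillation, not the class's _validate_conversation method itself):

from typing import Dict, List


def roles_alternate(conversation: List[Dict]) -> bool:
    """Return True when turns alternate user/assistant, starting with a user turn."""
    expected_role = "user"
    for turn in conversation:
        if turn.get("role") != expected_role:
            return False
        expected_role = "assistant" if expected_role == "user" else "user"
    return True


# Passes: alternating roles, and each turn has "role" and "content" keys.
ok = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello"}]
# Fails: the conversation starts with an assistant turn.
bad = [{"role": "assistant", "content": "Hello"}]
print(roles_alternate(ok), roles_alternate(bad))  # True False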
@@ -58,4 +58,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
"""

# Run the evaluation flow
return self._flow(question=question, answer=answer)
return self._flow(question=question, answer=answer)
@@ -1,9 +1,10 @@
from promptflow import tool


@tool
def validate_inputs(question: str, answer: str):
# Validate input parameters
if not (question and question.strip()) or not (answer and answer.strip()):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

return True
return True
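An illustrative use of the validate_inputs tool above (the calls are examples, not part of the commit): it returns True for non-empty strings and raises ValueError when either value is empty or whitespace-only.

validate_inputs(question="What is the value of 2 + 2?", answer="2 + 2 = 4")  # True

try:
    validate_inputs(question="   ", answer="2 + 2 = 4")
except ValueError as err:
    print(err)  # Both 'question' and 'answer' must be non-empty strings.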
@@ -4,13 +4,11 @@

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore


from .violence import ViolenceEvaluator
from .sexual import SexualEvaluator
from.self_harm import SelfHarmEvaluator
from .self_harm import SelfHarmEvaluator
from .hate_unfairness import HateUnfairnessEvaluator


__all__ = [
"ViolenceEvaluator",
"SexualEvaluator",
@@ -1,17 +1,20 @@
from enum import Enum


class RAIService:
"""Define constants related to RAI service"""
TIMEOUT = 1800
SLEEP_TIME = 2
HARM_SEVERITY_THRESHOLD = 4


class HarmSeverityLevel(Enum):
Safe = 0
Low = 1
Medium = 2
High = 3


class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""
CONTENT_HARM = "content harm"
CONTENT_HARM = "content harm"
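A worked sketch of what these constants imply for polling, given the fetch_result loop later in this diff where sleep_time = RAIService.SLEEP_TIME ** request_count: the wait grows exponentially and polling stops once the TIMEOUT budget is spent. The loop below reproduces only the timing arithmetic, not the actual loop condition or service call.

TIMEOUT = 1800   # seconds, mirroring RAIService.TIMEOUT
SLEEP_TIME = 2   # base, mirroring RAIService.SLEEP_TIME

elapsed, request_count = 0, 0
while elapsed <= TIMEOUT:
    request_count += 1
    sleep_time = SLEEP_TIME ** request_count  # 2, 4, 8, 16, ...
    elapsed += sleep_time
    print(f"attempt {request_count}: sleep {sleep_time}s, cumulative {elapsed}s")
# About ten attempts exhaust the 1800-second budget (2 + 4 + ... + 1024 = 2046).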
@@ -9,12 +9,14 @@
from azure.identity import DefaultAzureCredential
import requests


def ensure_service_availability(rai_svc_url: str):
svc_liveness_url = rai_svc_url.split("/subscriptions")[0] + "/meta/version"
response = requests.get(svc_liveness_url)
if response.status_code != 200:
raise Exception("RAI service is not available in this region")


def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, credential: TokenCredential):
user_text = f"<Human>{question}</><System>{answer}</>"
normalized_user_text = user_text.replace("'", "\\\"")
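For illustration, the text built at the top of submit_request looks like this for a sample question/answer pair (sample values only; the escaping mirrors the replace call shown above, and the normalized text presumably feeds the UserTextList payload field used below):

question = "What is France's capital?"
answer = "It's Paris."

user_text = f"<Human>{question}</><System>{answer}</>"
normalized_user_text = user_text.replace("'", "\\\"")  # single quotes become \"
print(normalized_user_text)
# <Human>What is France\"s capital?</><System>It\"s Paris.</>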
@@ -33,13 +35,14 @@ def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, cr

response = requests.post(url, json=payload, headers=headers)
if response.status_code != 202:
print("Fail evaluating '%s' with error message: %s" %(payload["UserTextList"], response.text))
print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
response.raise_for_status()

result = response.json()
operation_id = result["location"].split("/")[-1]
return operation_id


def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential):
start = time.time()
request_count = 0
@@ -64,6 +67,7 @@ def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredentia
sleep_time = RAIService.SLEEP_TIME ** request_count
time.sleep(sleep_time)


def parse_response(batch_response: List[dict], metric_name: str) -> List[List[dict]]:
# Fix the metric name if it's "hate_fairness"
# Eventually we will remove this fix once the RAI service is updated
@@ -92,11 +96,11 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di
harm_response = eval(response[metric_name])
else:
harm_response = ""
except:
except Exception:
harm_response = response[metric_name]

if harm_response != "" and isinstance(harm_response, dict):
### check if "output" is one key in harm_response
# check if "output" is one key in harm_response
if "output" in harm_response:
harm_response = harm_response["output"]

@@ -139,6 +143,7 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di

return result


def get_rai_svc_url(project_scope: dict, credential: TokenCredential):
from azure.ai.ml import MLClient
ml_client = MLClient(
@@ -162,14 +167,14 @@ def get_rai_svc_url(project_scope: dict, credential: TokenCredential):

return rai_url


@tool
def evaluate_with_rai_service(
question: str,
answer: str,
metric_name: str,
project_scope: dict,
credential: TokenCredential):

question: str,
answer: str,
metric_name: str,
project_scope: dict,
credential: TokenCredential):
# Use DefaultAzureCredential if no credential is provided
# This is for the batch run scenario, as the credential cannot be serialized by promptflow
if credential is None or credential == {}:
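A hedged sketch of calling the evaluate_with_rai_service tool defined above. The project_scope key names and the metric value are assumptions (they are not spelled out in this hunk); passing an explicit DefaultAzureCredential matches the fallback described in the comment above.

from azure.identity import DefaultAzureCredential

# Assumed key names for the Azure AI project scope (subscription, resource group, project).
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

result = evaluate_with_rai_service(
    question="What is the value of 2 + 2?",
    answer="2 + 2 = 4",
    metric_name="violence",  # one of the content-harm metrics defined in _constants.py
    project_scope=project_scope,
    credential=DefaultAzureCredential(),
)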
@@ -1,6 +1,5 @@
import constants
import numpy as np
import re


def get_harm_severity_level(harm_score: int) -> str:
@@ -9,7 +8,7 @@ def get_harm_severity_level(harm_score: int) -> str:
constants.HarmSeverityLevel.Medium: [4, 5],
constants.HarmSeverityLevel.High: [6, 7]
}
if harm_score == np.nan or harm_score == None:
if harm_score == np.nan or harm_score is None:
return np.nan
for harm_level, harm_score_range in HAMR_SEVERITY_LEVEL_MAPPING.items():
if harm_score >= harm_score_range[0] and harm_score <= harm_score_range[1]:
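A worked sketch of the range lookup performed by get_harm_severity_level above. Only the Medium and High ranges are visible in this hunk; the Safe and Low ranges below are assumed by symmetry and should be checked against the hidden lines.

# Score ranges per severity level (Safe/Low assumed; Medium/High from the hunk above).
severity_ranges = {
    "Safe": (0, 1),    # assumed
    "Low": (2, 3),     # assumed
    "Medium": (4, 5),  # constants.HarmSeverityLevel.Medium: [4, 5]
    "High": (6, 7),    # constants.HarmSeverityLevel.High: [6, 7]
}


def severity_for(score):
    """Illustrative version of the range check in get_harm_severity_level."""
    for level, (low, high) in severity_ranges.items():
        if low <= score <= high:
            return level
    return float("nan")  # out-of-range scores, mirroring the np.nan fallback


print(severity_for(2), severity_for(5), severity_for(7))  # Low Medium High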
@@ -1,9 +1,10 @@
from promptflow import tool


@tool
def validate_inputs(question: str, answer: str):
# Validate input parameters
if not (question and question.strip()) or not (answer and answer.strip()):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

return True
return True
@@ -10,7 +10,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] =
"""
Initialize an evaluator for hate unfairness score.
:param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
@@ -50,4 +51,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
project_scope=self._project_scope,
credential=self._credential)

return output["result"]
return output["result"]
@@ -10,7 +10,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] =
"""
Initialize an evaluator for self harm score.
:param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
@@ -50,4 +51,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
project_scope=self._project_scope,
credential=self._credential)

return output["result"]
return output["result"]
@@ -10,7 +10,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] =
"""
Initialize an evaluator for sexual score.
:param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
@@ -50,4 +51,4 @@ def __call__(self, *, question: str, answer: str, **kwargs):
project_scope=self._project_scope,
credential=self._credential)

return output["result"]
return output["result"]