Adding promptflow-evals #2597

Merged 3 commits on Apr 2, 2024
4 changes: 3 additions & 1 deletion .cspell.json
@@ -189,7 +189,9 @@
"pywin",
"STARTF",
"mltable",
"setenv"
"setenv",
"raisvc",
"evals"
],
"flagWords": [
"Prompt Flow"
Empty file.
Empty file.
Empty file.
11 changes: 11 additions & 0 deletions src/promptflow-evals/promptflow/evals/__init__.py
@@ -0,0 +1,11 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from .evaluate import evaluate

__all__ = [
"evaluate"
]
12 changes: 12 additions & 0 deletions src/promptflow-evals/promptflow/evals/_constants.py
@@ -0,0 +1,12 @@
class EvaluationMetrics:
GPT_GROUNDEDNESS = "gpt_groundedness"
GPT_RELEVANCE = "gpt_relevance"
GPT_COHERENCE = "gpt_coherence"
GPT_FLUENCY = "gpt_fluency"
GPT_SIMILARITY = "gpt_similarity"
F1_SCORE = "f1_score"
GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
HATE_FAIRNESS = "hate_fairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
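These names presumably line up with the metric columns the evaluators produce. A small sketch (not part of the diff) of referring to them through the constants instead of raw strings; the grouping below is illustrative:

from promptflow.evals._constants import EvaluationMetrics

# Sketch: pick out the content-safety metric names via the constants.
safety_metrics = [
    EvaluationMetrics.HATE_FAIRNESS,
    EvaluationMetrics.VIOLENCE,
    EvaluationMetrics.SELF_HARM,
    EvaluationMetrics.SEXUAL,
]
print(safety_metrics)  # ['hate_fairness', 'violence', 'self_harm', 'sexual']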
9 changes: 9 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/__init__.py
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from ._evaluate import evaluate

__all__ = ["evaluate"]
109 changes: 109 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -0,0 +1,109 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Optional, Dict, Callable
import pandas as pd
from ._flow_run_wrapper import FlowRunWrapper
from promptflow import PFClient


def _calculate_mean(df) -> Dict[str, float]:
mean_value = df.mean(numeric_only=True)
return mean_value.to_dict()


def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_name):
if target is None and data is None:
raise ValueError("Either target or data must be provided for evaluation.")

if target is not None:
if not callable(target):
raise ValueError("target must be a callable function.")

if data is not None:
if not isinstance(data, str):
raise ValueError("data must be a string.")

if evaluators is not None:
if not isinstance(evaluators, dict):
raise ValueError("evaluators must be a dictionary.")

if output_path is not None:
if not isinstance(output_path, str):
raise ValueError("output_path must be a string.")

if tracking_uri is not None:
if not isinstance(tracking_uri, str):
raise ValueError("tracking_uri must be a string.")

if evaluation_name is not None:
if not isinstance(evaluation_name, str):
raise ValueError("evaluation_name must be a string.")


def evaluate(
*,
evaluation_name: Optional[str] = None,
target: Optional[Callable] = None,
data: Optional[str] = None,
evaluators: Optional[Dict[str, Callable]] = None,
evaluator_config: Optional[Dict[str, Dict[str, str]]] = {},
tracking_uri: Optional[str] = None,
output_path: Optional[str] = None,
**kwargs,
):
"""Evaluates target or data with built-in evaluation metrics

:keyword evaluation_name: Display name of the evaluation.
:paramtype evaluation_name: Optional[str]
:keyword target: Target to be evaluated. `target` and `data` cannot both be None.
:paramtype target: Optional[Callable]
:keyword data: Path to the data to be evaluated or passed to target if target is set.
Only .jsonl format files are supported. `target` and `data` cannot both be None.
:paramtype data: Optional[str]
:keyword evaluators: Evaluators to run over the data. Keys are the names used to prefix the
result columns; values are the evaluator callables.
:paramtype evaluators: Optional[Dict[str, Callable]]
:keyword evaluator_config: Configuration for evaluators.
:paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]]
:keyword output_path: The local folder path to save evaluation artifacts to if set
:paramtype output_path: Optional[str]
:keyword tracking_uri: Tracking URI for logging evaluation results to AI Studio.
:paramtype tracking_uri: Optional[str]
:return: A dictionary with the per-row results ("rows"), aggregated metrics ("metrics") and traces.
:rtype: dict
"""

_validation(target, data, evaluators, output_path, tracking_uri, evaluation_name)

evaluator_run_list = []
pf_client = PFClient()

for evaluator_name, evaluator in evaluators.items():
evaluator_run_list.append(FlowRunWrapper(pf_client.run(
flow=evaluator,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data,
stream=True
),
prefix=evaluator_name
))

result_df = None
for eval_run in evaluator_run_list:
if result_df is None:
result_df = eval_run.get_result_df(all_results=True, exclude_inputs=True)
else:
result_df = pd.concat(
[eval_run.get_result_df(all_results=True, exclude_inputs=True), result_df],
axis=1,
verify_integrity=True
)

input_data_df = pd.read_json(data, lines=True)
input_data_df = input_data_df.rename(columns={col: f"inputs.{col}" for col in input_data_df.columns})

row_results = pd.concat([input_data_df, result_df], axis=1, verify_integrity=True)

return {
"rows": row_results.to_dict("records"),
"metrics": _calculate_mean(result_df),
"traces": {}
}
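A minimal usage sketch for the new evaluate() entry point (not part of the diff). The data.jsonl file and the answer_length evaluator below are hypothetical; per the implementation above, each evaluator is handed to PFClient.run over the .jsonl rows and its output columns come back prefixed with the evaluator's name.

from promptflow.evals import evaluate


def answer_length(answer: str) -> dict:
    """Toy row-level evaluator: scores the character length of the answer."""
    return {"length": len(answer)}


# Assumes data.jsonl has an "answer" field on every line.
result = evaluate(
    evaluation_name="answer-length-check",
    data="data.jsonl",
    evaluators={"answer_length": answer_length},
)
print(result["metrics"])  # e.g. {'answer_length.length': 42.0}
print(result["rows"][0])  # inputs.* columns plus answer_length.* columns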
33 changes: 33 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/_flow_run_wrapper.py
@@ -0,0 +1,33 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import time
from promptflow import PFClient


class FlowRunWrapper(object):
def __init__(self, flow_run, prefix=None, **kwargs):
self.flow_run = flow_run
self.column_mapping = flow_run.column_mapping
self.prefix = prefix if prefix is not None else ""
self.client = PFClient()

def get_result_df(self, all_results=True, exclude_inputs=False):
self._wait_for_completion()
result_df = self.client.get_details(self.flow_run.name, all_results=all_results)
if exclude_inputs:
result_df = result_df.drop(
columns=[col for col in result_df.columns if col.startswith("inputs.")]
)
result_df.rename(
columns={col: col.replace("outputs", self.prefix)
for col in result_df.columns if col.startswith("outputs.")},
inplace=True)
return result_df

def _wait_for_completion(self):
from promptflow._sdk._constants import RunStatus
while True:
if self.flow_run.status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED]:
break
time.sleep(2)
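To illustrate the column handling in get_result_df, here is a toy sketch (column names and values are made up): inputs.* columns are dropped and the outputs prefix is swapped for the evaluator name, which is what keeps per-evaluator results distinguishable after the concat in _evaluate.py.

import pandas as pd

# Toy stand-in for the details frame PFClient returns for one evaluator run.
details = pd.DataFrame({
    "inputs.question": ["Why is the sky blue?"],
    "outputs.gpt_coherence": [5],
})

prefix = "coherence"  # the evaluator name passed to FlowRunWrapper
details = details.drop(columns=[c for c in details.columns if c.startswith("inputs.")])
details.rename(
    columns={c: c.replace("outputs", prefix) for c in details.columns if c.startswith("outputs.")},
    inplace=True,
)
print(list(details.columns))  # ['coherence.gpt_coherence']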
9 changes: 9 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/_utils.py
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json


def load_jsonl(path):
with open(path, "r", encoding="utf-8") as f:
return [json.loads(line) for line in f.readlines()]
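A quick sanity-check sketch for the helper (the file name and contents are illustrative):

import json

# Write a two-line .jsonl fixture, then read it back with load_jsonl.
with open("sample.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps({"question": "Why is the sky blue?", "answer": "Rayleigh scattering."}) + "\n")
    f.write(json.dumps({"question": "What is 2 + 2?", "answer": "4"}) + "\n")

rows = load_jsonl("sample.jsonl")
print(len(rows), rows[0]["answer"])  # 2 Rayleigh scattering.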
27 changes: 27 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -0,0 +1,27 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore


from .coherence import CoherenceEvaluator
from .f1_score import F1ScoreEvaluator
from .fluency import FluencyEvaluator
from .groundedness import GroundednessEvaluator
from .relevance import RelevanceEvaluator
from .similarity import SimilarityEvaluator
from .qa import QAEvaluator
from .chat import ChatEvaluator


__all__ = [
"CoherenceEvaluator",
"F1ScoreEvaluator",
"FluencyEvaluator",
"GroundednessEvaluator",
"RelevanceEvaluator",
"SimilarityEvaluator",
"QAEvaluator",
"ChatEvaluator",
]
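The evaluator implementations themselves are not shown in this excerpt, so the sketch below is only a guess at how these exports plug into evaluate(); the no-argument F1ScoreEvaluator constructor and the qa_pairs.jsonl file are assumptions, not something this diff documents.

# Hedged sketch, not taken from this diff: the evaluator classes live in sibling
# modules that are not shown, so the constructor call below is assumed.
from promptflow.evals import evaluate
from promptflow.evals.evaluators import F1ScoreEvaluator

f1 = F1ScoreEvaluator()  # assumption: F1 needs no model configuration
result = evaluate(
    data="qa_pairs.jsonl",  # assumed .jsonl with "answer" and "ground_truth" fields
    evaluators={"f1_score": f1},
)
print(result["metrics"])  # expected to include a key like 'f1_score.f1_score'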