Commit 0785495

Adding promptflow-evals (#2597)
# Description

Please add an informative description that covers the changes made by the pull request and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated review from the promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
1 parent 3af352f commit 0785495

Note: large commits have some content hidden by default, so some file headers below are missing.

68 files changed (+2416 / -1 lines)

.cspell.json

+3 / -1

@@ -189,7 +189,9 @@
     "pywin",
     "STARTF",
     "mltable",
-    "setenv"
+    "setenv",
+    "raisvc",
+    "evals"
   ],
   "flagWords": [
     "Prompt Flow"
src/promptflow-evals/CHANGELOG.md

Whitespace-only changes.

src/promptflow-evals/MANIFEST.in

Whitespace-only changes.

src/promptflow-evals/README.md

Whitespace-only changes.
@@ -0,0 +1,11 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)  # type: ignore

from .evaluate import evaluate

__all__ = [
    "evaluate"
]
@@ -0,0 +1,12 @@
class EvaluationMetrics:
    GPT_GROUNDEDNESS = "gpt_groundedness"
    GPT_RELEVANCE = "gpt_relevance"
    GPT_COHERENCE = "gpt_coherence"
    GPT_FLUENCY = "gpt_fluency"
    GPT_SIMILARITY = "gpt_similarity"
    F1_SCORE = "f1_score"
    GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
    HATE_FAIRNESS = "hate_fairness"
    VIOLENCE = "violence"
    SELF_HARM = "self_harm"
    SEXUAL = "sexual"
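
These constants provide names for the built-in quality and content-safety metrics. A minimal sketch of how they might be used downstream (hypothetical helper, not part of this commit):

# Hypothetical helper, not part of this commit: group the content-safety metric
# names defined on EvaluationMetrics for later filtering of result columns.
SAFETY_METRICS = {
    EvaluationMetrics.HATE_FAIRNESS,
    EvaluationMetrics.VIOLENCE,
    EvaluationMetrics.SELF_HARM,
    EvaluationMetrics.SEXUAL,
}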
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from ._evaluate import evaluate

__all__ = ["evaluate"]
@@ -0,0 +1,109 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Callable, Dict, Optional

import pandas as pd

from promptflow import PFClient

from ._flow_run_wrapper import FlowRunWrapper


def _calculate_mean(df) -> Dict[str, float]:
    # Aggregate metrics: the mean of every numeric column in the results frame.
    mean_value = df.mean(numeric_only=True)
    return mean_value.to_dict()


def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_name):
    if target is None and data is None:
        raise ValueError("Either target or data must be provided for evaluation.")

    if target is not None and not callable(target):
        raise ValueError("target must be a callable function.")

    if data is not None and not isinstance(data, str):
        raise ValueError("data must be a string.")

    if evaluators is not None and not isinstance(evaluators, dict):
        raise ValueError("evaluators must be a dictionary.")

    if output_path is not None and not isinstance(output_path, str):
        raise ValueError("output_path must be a string.")

    if tracking_uri is not None and not isinstance(tracking_uri, str):
        raise ValueError("tracking_uri must be a string.")

    if evaluation_name is not None and not isinstance(evaluation_name, str):
        raise ValueError("evaluation_name must be a string.")


def evaluate(
    *,
    evaluation_name: Optional[str] = None,
    target: Optional[Callable] = None,
    data: Optional[str] = None,
    evaluators: Optional[Dict[str, Callable]] = None,
    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
    tracking_uri: Optional[str] = None,
    output_path: Optional[str] = None,
    **kwargs,
):
    """Evaluates target or data with built-in evaluation metrics.

    :keyword evaluation_name: Display name of the evaluation.
    :paramtype evaluation_name: Optional[str]
    :keyword target: Target to be evaluated. `target` and `data` cannot both be None.
    :paramtype target: Optional[Callable]
    :keyword data: Path to the data to be evaluated or passed to target if target is set.
        Only .jsonl format files are supported. `target` and `data` cannot both be None.
    :paramtype data: Optional[str]
    :keyword evaluators: Mapping of evaluator names to evaluator callables (flows) to run over the data.
    :paramtype evaluators: Optional[Dict[str, Callable]]
    :keyword evaluator_config: Configuration (column mappings) for evaluators, keyed by evaluator name.
    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]]
    :keyword output_path: The local folder path to save evaluation artifacts to, if set.
    :paramtype output_path: Optional[str]
    :keyword tracking_uri: Tracking URI to log evaluation results to AI Studio.
    :paramtype tracking_uri: Optional[str]
    :return: Evaluation results as a dict with "rows", "metrics" and "traces" keys.
    :rtype: dict
    """
    _validation(target, data, evaluators, output_path, tracking_uri, evaluation_name)

    # Avoid a mutable default argument for evaluator_config.
    if evaluator_config is None:
        evaluator_config = {}

    evaluator_run_list = []
    pf_client = PFClient()

    # Submit one batch run per evaluator against the input data.
    for evaluator_name, evaluator in evaluators.items():
        evaluator_run_list.append(FlowRunWrapper(
            pf_client.run(
                flow=evaluator,
                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
                data=data,
                stream=True,
            ),
            prefix=evaluator_name,
        ))

    # Collect each evaluator's outputs and join them column-wise into one frame.
    result_df = None
    for eval_run in evaluator_run_list:
        if result_df is None:
            result_df = eval_run.get_result_df(all_results=True, exclude_inputs=True)
        else:
            result_df = pd.concat(
                [eval_run.get_result_df(all_results=True, exclude_inputs=True), result_df],
                axis=1,
                verify_integrity=True,
            )

    # Prefix the original input columns and place them alongside evaluator outputs.
    input_data_df = pd.read_json(data, lines=True)
    input_data_df = input_data_df.rename(columns={col: f"inputs.{col}" for col in input_data_df.columns})

    row_results = pd.concat([input_data_df, result_df], axis=1, verify_integrity=True)

    return {
        "rows": row_results.to_dict("records"),
        "metrics": _calculate_mean(result_df),
        "traces": {},
    }
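
For orientation, here is a minimal usage sketch of this `evaluate` API. It is not part of the commit: the package namespace (`promptflow.evals`), the data path, and the evaluator flow path are assumptions, and the column mapping follows promptflow's `${data.<column>}` convention.

# Hypothetical example, not from the commit: run one evaluator flow over a local
# .jsonl file and inspect the aggregated metrics.
from promptflow.evals import evaluate  # assumes the package namespace is promptflow.evals

result = evaluate(
    evaluation_name="qa-eval",                    # display name (assumed)
    data="data/qa_pairs.jsonl",                   # hypothetical .jsonl path
    evaluators={"relevance": "flows/relevance"},  # hypothetical evaluator flow folder
    evaluator_config={
        "default": {"question": "${data.question}", "answer": "${data.answer}"},
    },
)

print(result["metrics"])   # e.g. {"relevance.gpt_relevance": ...}
print(result["rows"][0])   # per-row "inputs.*" and "relevance.*" columns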
@@ -0,0 +1,33 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import time

from promptflow import PFClient


class FlowRunWrapper(object):
    """Wraps a submitted flow run and exposes its results as a prefixed DataFrame."""

    def __init__(self, flow_run, prefix=None, **kwargs):
        self.flow_run = flow_run
        self.column_mapping = flow_run.column_mapping
        self.prefix = prefix if prefix is not None else ""
        self.client = PFClient()

    def get_result_df(self, all_results=True, exclude_inputs=False):
        self._wait_for_completion()
        result_df = self.client.get_details(self.flow_run.name, all_results=all_results)
        if exclude_inputs:
            result_df = result_df.drop(
                columns=[col for col in result_df.columns if col.startswith("inputs.")]
            )
        # Rename "outputs.<name>" columns to "<prefix>.<name>" so results from
        # multiple evaluators can be concatenated without column collisions.
        result_df.rename(
            columns={col: col.replace("outputs", self.prefix)
                     for col in result_df.columns if col.startswith("outputs.")},
            inplace=True)
        return result_df

    def _wait_for_completion(self):
        from promptflow._sdk._constants import RunStatus
        # Poll until the run reaches a terminal state.
        while True:
            if self.flow_run.status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED]:
                break
            time.sleep(2)
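
A short sketch of how the wrapper is meant to be used, mirroring the `evaluate` implementation above (the flow and data paths are hypothetical):

# Hypothetical example, not from the commit: wrap a submitted batch run and read
# its outputs back with a "fluency." prefix instead of the default "outputs." prefix.
from promptflow import PFClient

pf_client = PFClient()
run = pf_client.run(flow="flows/fluency", data="data/qa_pairs.jsonl", stream=True)  # hypothetical paths

wrapped = FlowRunWrapper(run, prefix="fluency")
df = wrapped.get_result_df(all_results=True, exclude_inputs=True)
print(list(df.columns))  # e.g. ["fluency.gpt_fluency"]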
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json


def load_jsonl(path):
    # Read a .jsonl file into a list of dicts, one per line.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f.readlines()]
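
A quick usage sketch (the path and field name are hypothetical):

# Hypothetical example, not from the commit.
rows = load_jsonl("data/qa_pairs.jsonl")
print(len(rows), rows[0].get("question"))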
@@ -0,0 +1,27 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)  # type: ignore


from .coherence import CoherenceEvaluator
from .f1_score import F1ScoreEvaluator
from .fluency import FluencyEvaluator
from .groundedness import GroundednessEvaluator
from .relevance import RelevanceEvaluator
from .similarity import SimilarityEvaluator
from .qa import QAEvaluator
from .chat import ChatEvaluator


__all__ = [
    "CoherenceEvaluator",
    "F1ScoreEvaluator",
    "FluencyEvaluator",
    "GroundednessEvaluator",
    "RelevanceEvaluator",
    "SimilarityEvaluator",
    "QAEvaluator",
    "ChatEvaluator",
]
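
The exported evaluators are intended to be passed to `evaluate` via the `evaluators` mapping. A hypothetical sketch follows; the commit does not show the evaluator implementations, so the constructor call, package namespace, data path, and column mapping are all assumptions:

# Hypothetical example, not from the commit: wire a built-in evaluator into evaluate().
from promptflow.evals import evaluate
from promptflow.evals.evaluators import F1ScoreEvaluator  # namespace assumed

f1_evaluator = F1ScoreEvaluator()  # constructor signature assumed
result = evaluate(
    data="data/qa_pairs.jsonl",    # hypothetical .jsonl path
    evaluators={"f1": f1_evaluator},
    evaluator_config={
        "default": {"answer": "${data.answer}", "ground_truth": "${data.ground_truth}"},
    },
)
print(result["metrics"])           # e.g. {"f1.f1_score": ...}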
