Adding promptflow-evals #2597

Merged 3 commits on Apr 2, 2024
4 changes: 3 additions & 1 deletion .cspell.json
@@ -189,7 +189,9 @@
"pywin",
"STARTF",
"mltable",
"setenv"
"setenv",
"raisvc",
"evals"
],
"flagWords": [
"Prompt Flow"
Empty file.
Empty file.
Empty file.
11 changes: 11 additions & 0 deletions src/promptflow-evals/promptflow/evals/__init__.py
@@ -0,0 +1,11 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from .evaluate import evaluate

__all__ = [
"evaluate"
]
12 changes: 12 additions & 0 deletions src/promptflow-evals/promptflow/evals/_constants.py
@@ -0,0 +1,12 @@
class EvaluationMetrics:
GPT_GROUNDEDNESS = "gpt_groundedness"
GPT_RELEVANCE = "gpt_relevance"
GPT_COHERENCE = "gpt_coherence"
GPT_FLUENCY = "gpt_fluency"
GPT_SIMILARITY = "gpt_similarity"
F1_SCORE = "f1_score"
GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
HATE_FAIRNESS = "hate_fairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
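These names presumably line up with the metric columns the evaluators produce. A small sketch (not part of the diff) of referring to them through the constants instead of raw strings; the grouping below is illustrative:

from promptflow.evals._constants import EvaluationMetrics

# Sketch: pick out the content-safety metric names via the constants.
safety_metrics = [
    EvaluationMetrics.HATE_FAIRNESS,
    EvaluationMetrics.VIOLENCE,
    EvaluationMetrics.SELF_HARM,
    EvaluationMetrics.SEXUAL,
]
print(safety_metrics)  # ['hate_fairness', 'violence', 'self_harm', 'sexual']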
9 changes: 9 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/__init__.py
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from ._evaluate import evaluate

__all__ = ["evaluate"]
109 changes: 109 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -0,0 +1,109 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Optional, Dict, Callable
import pandas as pd
from ._flow_run_wrapper import FlowRunWrapper
from promptflow import PFClient


def _calculate_mean(df) -> Dict[str, float]:
mean_value = df.mean(numeric_only=True)
return mean_value.to_dict()


def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_name):
if target is None and data is None:
raise ValueError("Either target or data must be provided for evaluation.")

if target is not None:
if not callable(target):
raise ValueError("target must be a callable function.")

if data is not None:
if not isinstance(data, str):
raise ValueError("data must be a string.")

if evaluators is not None:
if not isinstance(evaluators, dict):
raise ValueError("evaluators must be a dictionary.")

if output_path is not None:
if not isinstance(output_path, str):
raise ValueError("output_path must be a string.")

if tracking_uri is not None:
if not isinstance(tracking_uri, str):
raise ValueError("tracking_uri must be a string.")

if evaluation_name is not None:
if not isinstance(evaluation_name, str):
raise ValueError("evaluation_name must be a string.")


def evaluate(
*,
evaluation_name: Optional[str] = None,
target: Optional[Callable] = None,
data: Optional[str] = None,
evaluators: Optional[Dict[str, Callable]] = None,
evaluator_config: Optional[Dict[str, Dict[str, str]]] = {},
tracking_uri: Optional[str] = None,
output_path: Optional[str] = None,
**kwargs,
):
"""Evaluates target or data with built-in evaluation metrics

:keyword evaluation_name: Display name of the evaluation.
:paramtype evaluation_name: Optional[str]
:keyword target: Target to be evaluated. `target` and `data` cannot both be None.
:paramtype target: Optional[Callable]
:keyword data: Path to the data to be evaluated or passed to target if target is set.
Only .jsonl format files are supported. `target` and `data` cannot both be None.
:paramtype data: Optional[str]
:keyword evaluators: Evaluators to run over the data. Keys are the names used to prefix the
result columns; values are the evaluator callables.
:paramtype evaluators: Optional[Dict[str, Callable]]
:keyword evaluator_config: Configuration for evaluators.
:paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]]
:keyword output_path: The local folder path to save evaluation artifacts to if set
:paramtype output_path: Optional[str]
:keyword tracking_uri: Tracking URI for logging evaluation results to AI Studio.
:paramtype tracking_uri: Optional[str]
:return: A dictionary with the per-row results ("rows"), aggregated metrics ("metrics") and traces.
:rtype: dict
"""

_validation(target, data, evaluators, output_path, tracking_uri, evaluation_name)

evaluator_run_list = []
pf_client = PFClient()

for evaluator_name, evaluator in evaluators.items():
evaluator_run_list.append(FlowRunWrapper(pf_client.run(
flow=evaluator,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data,
stream=True
),
prefix=evaluator_name
))

result_df = None
for eval_run in evaluator_run_list:
if result_df is None:
result_df = eval_run.get_result_df(all_results=True, exclude_inputs=True)
else:
result_df = pd.concat(
[eval_run.get_result_df(all_results=True, exclude_inputs=True), result_df],
axis=1,
verify_integrity=True
)

input_data_df = pd.read_json(data, lines=True)
input_data_df = input_data_df.rename(columns={col: f"inputs.{col}" for col in input_data_df.columns})

row_results = pd.concat([input_data_df, result_df], axis=1, verify_integrity=True)

return {
"rows": row_results.to_dict("records"),
"metrics": _calculate_mean(result_df),
"traces": {}
}
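A minimal usage sketch for the new evaluate() entry point (not part of the diff). The data.jsonl file and the answer_length evaluator below are hypothetical; per the implementation above, each evaluator is handed to PFClient.run over the .jsonl rows and its output columns come back prefixed with the evaluator's name.

from promptflow.evals import evaluate


def answer_length(answer: str) -> dict:
    """Toy row-level evaluator: scores the character length of the answer."""
    return {"length": len(answer)}


# Assumes data.jsonl has an "answer" field on every line.
result = evaluate(
    evaluation_name="answer-length-check",
    data="data.jsonl",
    evaluators={"answer_length": answer_length},
)
print(result["metrics"])  # e.g. {'answer_length.length': 42.0}
print(result["rows"][0])  # inputs.* columns plus answer_length.* columns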
33 changes: 33 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/_flow_run_wrapper.py
@@ -0,0 +1,33 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import time
from promptflow import PFClient


class FlowRunWrapper(object):
def __init__(self, flow_run, prefix=None, **kwargs):
self.flow_run = flow_run
self.column_mapping = flow_run.column_mapping
self.prefix = prefix if prefix is not None else ""
self.client = PFClient()

def get_result_df(self, all_results=True, exclude_inputs=False):
self._wait_for_completion()
result_df = self.client.get_details(self.flow_run.name, all_results=all_results)
if exclude_inputs:
result_df = result_df.drop(
columns=[col for col in result_df.columns if col.startswith("inputs.")]
)
result_df.rename(
columns={col: col.replace("outputs", self.prefix)
for col in result_df.columns if col.startswith("outputs.")},
inplace=True)
return result_df

def _wait_for_completion(self):
from promptflow._sdk._constants import RunStatus
while True:
if self.flow_run.status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED]:
break
time.sleep(2)
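To illustrate the column handling in get_result_df, here is a toy sketch (column names and values are made up): inputs.* columns are dropped and the outputs prefix is swapped for the evaluator name, which is what keeps per-evaluator results distinguishable after the concat in _evaluate.py.

import pandas as pd

# Toy stand-in for the details frame PFClient returns for one evaluator run.
details = pd.DataFrame({
    "inputs.question": ["Why is the sky blue?"],
    "outputs.gpt_coherence": [5],
})

prefix = "coherence"  # the evaluator name passed to FlowRunWrapper
details = details.drop(columns=[c for c in details.columns if c.startswith("inputs.")])
details.rename(
    columns={c: c.replace("outputs", prefix) for c in details.columns if c.startswith("outputs.")},
    inplace=True,
)
print(list(details.columns))  # ['coherence.gpt_coherence']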
9 changes: 9 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluate/_utils.py
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json


def load_jsonl(path):
with open(path, "r", encoding="utf-8") as f:
return [json.loads(line) for line in f.readlines()]
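A quick sanity-check sketch for the helper (the file name and contents are illustrative):

import json

# Write a two-line .jsonl fixture, then read it back with load_jsonl.
with open("sample.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps({"question": "Why is the sky blue?", "answer": "Rayleigh scattering."}) + "\n")
    f.write(json.dumps({"question": "What is 2 + 2?", "answer": "4"}) + "\n")

rows = load_jsonl("sample.jsonl")
print(len(rows), rows[0]["answer"])  # 2 Rayleigh scattering.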
27 changes: 27 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -0,0 +1,27 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore


from .coherence import CoherenceEvaluator
from .f1_score import F1ScoreEvaluator
from .fluency import FluencyEvaluator
from .groundedness import GroundednessEvaluator
from .relevance import RelevanceEvaluator
from .similarity import SimilarityEvaluator
from .qa import QAEvaluator
from .chat import ChatEvaluator


__all__ = [
"CoherenceEvaluator",
"F1ScoreEvaluator",
"FluencyEvaluator",
"GroundednessEvaluator",
"RelevanceEvaluator",
"SimilarityEvaluator",
"QAEvaluator",
"ChatEvaluator",
]
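The evaluator implementations themselves are not shown in this excerpt, so the sketch below is only a guess at how these exports plug into evaluate(); the no-argument F1ScoreEvaluator constructor and the qa_pairs.jsonl file are assumptions, not something this diff documents.

# Hedged sketch, not taken from this diff: the evaluator classes live in sibling
# modules that are not shown, so the constructor call below is assumed.
from promptflow.evals import evaluate
from promptflow.evals.evaluators import F1ScoreEvaluator

f1 = F1ScoreEvaluator()  # assumption: F1 needs no model configuration
result = evaluate(
    data="qa_pairs.jsonl",  # assumed .jsonl with "answer" and "ground_truth" fields
    evaluators={"f1_score": f1},
)
print(result["metrics"])  # expected to include a key like 'f1_score.f1_score'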