[promptflow-evals] Switch to use AzureOpenAIModelConfiguration for built-in evaluators #2808

Merged · 7 commits · Apr 16, 2024
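In short, this PR drops the AzureOpenAIConnection plus separate deployment_name that the built-in evaluators previously took and replaces them with a single AzureOpenAIModelConfiguration. A minimal sketch of the new call pattern; the configuration field names are taken from the test template updated below, and the endpoint/key values are placeholders:

    from promptflow.core import AzureOpenAIModelConfiguration
    from promptflow.evals.evaluators import FluencyEvaluator

    # Old (removed) style:
    #   FluencyEvaluator(AzureOpenAIConnection(...), deployment_name="gpt-4")
    # New style: one configuration object carries endpoint, key, API version and deployment.
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
        api_key="<aoai-api-key>",                                   # placeholder
        api_version="2023-07-01-preview",
        azure_deployment="gpt-4",
    )

    eval_fn = FluencyEvaluator(model_config)
    result = eval_fn(
        question="What is the capital of Japan?",
        answer="The capital of Japan is Tokyo.",
    )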
9 changes: 4 additions & 5 deletions .github/workflows/promptflow-evals-e2e-test.yml
@@ -33,9 +33,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
# TODO: Following up with PF team for the attribute error from 3.8 and 3.9.
python-version: ['3.10', '3.11']
#python-version: ['3.8', '3.9', '3.10', '3.11']
python-version: ['3.8', '3.9', '3.10', '3.11']
fail-fast: false
# snok/install-poetry need this to support Windows
defaults:
@@ -58,8 +56,10 @@ jobs:
path: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow packages in editable mode
run: |
poetry run pip install -e ../promptflow
poetry run pip install -e ../promptflow-core
poetry run pip install -e ../promptflow-devkit
poetry run pip install -e ../promptflow-tracing
poetry run pip install -e ../promptflow-tools
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow-evals from wheel
@@ -73,8 +73,7 @@ jobs:
run: poetry install
working-directory: ${{ env.RECORD_DIRECTORY }}
- name: generate end-to-end test config from secret
# TODO: replace with evals secret
run: echo '${{ secrets.PF_TRACING_E2E_TEST_CONFIG }}' >> connections.json
run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: run e2e tests
run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
2 changes: 2 additions & 0 deletions .github/workflows/promptflow-evals-unit-test.yml
@@ -52,8 +52,10 @@ jobs:
path: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow packages in editable mode
run: |
poetry run pip install -e ../promptflow
poetry run pip install -e ../promptflow-core
poetry run pip install -e ../promptflow-devkit
poetry run pip install -e ../promptflow-tracing
poetry run pip install -e ../promptflow-tools
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow-evals from wheel
12 changes: 10 additions & 2 deletions scripts/dev-setup/test_resources.py
@@ -40,11 +40,19 @@ def create_evals_test_resource_template() -> None:
connections_filename = "connections.json"
connections_file_path = (working_dir / connections_filename).resolve().absolute()
connections_template = {
"azure_open_ai_connection": {
"azure_openai_model_config": {
"value": {
"azure_endpoint": "aoai-api-endpoint",
"api_key": "aoai-api-key",
"api_base": "aoai-api-endpoint",
"api_version": "2023-07-01-preview",
"azure_deployment": "aoai-deployment"
},
},
"azure_ai_project_scope": {
"value": {
"subscription_id": "subscription-id",
"resource_group_name": "resource-group-name",
"project_name": "project-name"
}
}
}
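The template above only writes placeholder values; a hedged sketch of how a test might read the generated connections.json back into a model configuration (the load_model_config helper is illustrative and not part of this PR):

    import json
    from pathlib import Path

    from promptflow.core import AzureOpenAIModelConfiguration


    def load_model_config(connections_file: str = "connections.json") -> AzureOpenAIModelConfiguration:
        # Read the template produced by create_evals_test_resource_template().
        values = json.loads(Path(connections_file).read_text())["azure_openai_model_config"]["value"]
        return AzureOpenAIModelConfiguration(
            azure_endpoint=values["azure_endpoint"],
            api_key=values["api_key"],
            api_version=values["api_version"],
            azure_deployment=values["azure_deployment"],
        )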
20 changes: 18 additions & 2 deletions src/promptflow-core/promptflow/core/__init__.py
@@ -4,13 +4,29 @@
__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from promptflow._core.metric_logger import log_metric
from ._version import __version__

# flake8: noqa
from promptflow._core.tool import ToolProvider, tool
from promptflow.core._flow import AsyncFlow, Flow
from promptflow.core._model_configuration import (
AzureOpenAIModelConfiguration,
ModelConfiguration,
OpenAIModelConfiguration,
)

from ._version import __version__

# backward compatibility
log_flow_metric = log_metric

__all__ = ["log_metric", "ToolProvider", "tool", "Flow", "AsyncFlow", "__version__"]
__all__ = [
"log_metric",
"ToolProvider",
"tool",
"Flow",
"AsyncFlow",
"ModelConfiguration",
"OpenAIModelConfiguration",
"AzureOpenAIModelConfiguration",
"__version__",
]
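With this change, all three configuration classes are importable straight from promptflow.core. A short sketch: the Azure fields mirror the test template in this PR, while the OpenAIModelConfiguration field names are an assumption, since only the Azure variant is exercised here:

    from promptflow.core import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

    # Azure-hosted deployment (fields as in connections.json above).
    azure_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<resource>.openai.azure.com",
        api_key="<aoai-api-key>",
        api_version="2023-07-01-preview",
        azure_deployment="gpt-4",
    )

    # Public OpenAI endpoint (assumed field names: model and api_key).
    openai_config = OpenAIModelConfiguration(model="gpt-4", api_key="<openai-api-key>")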
67 changes: 35 additions & 32 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
@@ -4,31 +4,28 @@

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from promptflow.connections import AzureOpenAIConnection
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List

import numpy as np

from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluators import CoherenceEvaluator, FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator

logger = logging.getLogger(__name__)


class ChatEvaluator:
def __init__(
self,
model_config: AzureOpenAIConnection,
deployment_name: str,
eval_last_turn: bool = False,
parallel: bool = True):
self, model_config: AzureOpenAIModelConfiguration, eval_last_turn: bool = False, parallel: bool = True
):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIConnection
:param deployment_name: Deployment to be used which has Azure OpenAI model.
:type deployment_name: AzureOpenAIConnection
:type model_config: AzureOpenAIModelConfiguration
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
@@ -42,7 +39,7 @@ def __init__(

.. code-block:: python

eval_fn = ChatEvaluator(model_config, deployment_name="gpt-4")
eval_fn = ChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
@@ -59,12 +56,12 @@

# TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
self._rag_evaluators = [
GroundednessEvaluator(model_config, deployment_name=deployment_name),
RelevanceEvaluator(model_config, deployment_name=deployment_name),
GroundednessEvaluator(model_config),
RelevanceEvaluator(model_config),
]
self._non_rag_evaluators = [
CoherenceEvaluator(model_config, deployment_name=deployment_name),
FluencyEvaluator(model_config, deployment_name=deployment_name),
CoherenceEvaluator(model_config),
FluencyEvaluator(model_config),
]

def __call__(self, *, conversation: List[Dict], **kwargs):
@@ -103,8 +100,10 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
# Select evaluators to be used for evaluation
compute_rag_based_metrics = True
if len(answers) != len(contexts):
safe_message = "Skipping rag based metrics as we need citations or " \
"retrieved_documents in context key of every assistant's turn"
safe_message = (
"Skipping rag based metrics as we need citations or "
"retrieved_documents in context key of every assistant's turn"
)
logger.warning(safe_message)
compute_rag_based_metrics = False

@@ -122,8 +121,9 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
# Parallel execution
with ThreadPoolExecutor() as executor:
future_to_evaluator = {
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator)
: evaluator
executor.submit(
self._evaluate_turn, turn_num, questions, answers, contexts, evaluator
): evaluator
for evaluator in selected_evaluators
}

@@ -158,15 +158,13 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
answer = answers[turn_num] if turn_num < len(answers) else ""
context = contexts[turn_num] if turn_num < len(contexts) else ""

score = evaluator(
question=question,
answer=answer,
context=context)
score = evaluator(question=question, answer=answer, context=context)

return score
except Exception as e:
logger.warning(
f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}"
)
return {}

def _aggregate_results(self, per_turn_results: List[Dict]):
Expand All @@ -175,7 +173,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]):

for turn in per_turn_results:
for metric, value in turn.items():
if 'reason' in metric:
if "reason" in metric:
if metric not in reasons:
reasons[metric] = []
reasons[metric].append(value)
@@ -214,24 +212,28 @@ def _validate_conversation(self, conversation: List[Dict]):
if "role" not in turn or "content" not in turn:
raise ValueError(
f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: "
f"{one_based_turn_num}")
f"{one_based_turn_num}"
)

if turn["role"] != expected_role:
raise ValueError(
f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")
f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
)

if not isinstance(turn["content"], str):
raise ValueError(f"Content in each turn must be a string. Turn number: {one_based_turn_num}")

if turn["role"] == "assistant" and "context" in turn:
if not isinstance(turn["context"], dict):
raise ValueError(
f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")
f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}"
)

if "citations" not in turn["context"]:
raise ValueError(
f"Context in each assistant's turn must have 'citations' key. Turn number:"
f" {one_based_turn_num}")
f" {one_based_turn_num}"
)

if not isinstance(turn["context"]["citations"], list):
raise ValueError(f"'citations' in context must be a list. Turn number: {one_based_turn_num}")
@@ -240,7 +242,8 @@ def _validate_conversation(self, conversation: List[Dict]):
if not isinstance(citation, dict):
raise ValueError(
f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num},"
f" Citation number: {citation_num + 1}")
f" Citation number: {citation_num + 1}"
)

# Toggle expected role for the next turn
expected_role = "user" if expected_role == "assistant" else "assistant"
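Pulling the chat changes together, a usage sketch of the updated ChatEvaluator: the conversation mirrors the docstring above, while the citation content and the shape of the aggregated result are illustrative only:

    from promptflow.core import AzureOpenAIModelConfiguration
    from promptflow.evals.evaluators.chat import ChatEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<resource>.openai.azure.com",  # placeholder
        api_key="<aoai-api-key>",                              # placeholder
        api_version="2023-07-01-preview",
        azure_deployment="gpt-4",
    )

    chat_eval = ChatEvaluator(model_config, eval_last_turn=False, parallel=True)

    # Turns must alternate user/assistant; assistant turns may carry a context dict
    # whose "citations" list feeds the RAG-based metrics (groundedness, relevance).
    conversation = [
        {"role": "user", "content": "What is the value of 2 + 2?"},
        {
            "role": "assistant",
            "content": "2 + 2 = 4",
            "context": {"citations": [{"id": "math_doc.md", "content": "2 + 2 = 4"}]},
        },
    ]

    result = chat_eval(conversation=conversation)  # aggregated metrics plus per-turn scores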
src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py
@@ -4,26 +4,26 @@

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from promptflow.client import load_flow
from promptflow.connections import AzureOpenAIConnection
from pathlib import Path

from promptflow.client import load_flow
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.core._prompty_utils import convert_model_configuration_to_connection


class CoherenceEvaluator:
def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
def __init__(self, model_config: AzureOpenAIModelConfiguration):
"""
Initialize an evaluation function configured for a specific Azure OpenAI model.
Initialize an evaluator configured for a specific Azure OpenAI model.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIConnection
:param deployment_name: Deployment to be used which has Azure OpenAI model.
:type deployment_name: AzureOpenAIConnection
:type model_config: AzureOpenAIModelConfiguration

**Usage**

.. code-block:: python

eval_fn = CoherenceEvaluator(model_config, deployment_name="gpt-4")
eval_fn = CoherenceEvaluator(model_config)
result = eval_fn(
question="What is the capital of Japan?",
answer="The capital of Japan is Tokyo.")
@@ -35,15 +35,11 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
self._flow = load_flow(source=flow_dir)

# Override the connection
connection = convert_model_configuration_to_connection(model_config)
self._flow.context.connections = {
"query_llm": {
"connection": AzureOpenAIConnection(
api_base=model_config.api_base,
api_key=model_config.api_key,
api_version=model_config.api_version,
api_type="azure"
),
"deployment_name": deployment_name,
"connection": connection,
"deployment_name": model_config.azure_deployment,
}
}

src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py
@@ -4,26 +4,26 @@

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from promptflow.client import load_flow
from promptflow.entities import AzureOpenAIConnection
from pathlib import Path

from promptflow.client import load_flow
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.core._prompty_utils import convert_model_configuration_to_connection


class FluencyEvaluator:
def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
def __init__(self, model_config: AzureOpenAIModelConfiguration):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIConnection
:param deployment_name: Deployment to be used which has Azure OpenAI model.
:type deployment_name: AzureOpenAIConnection
:type model_config: AzureOpenAIModelConfiguration

**Usage**

.. code-block:: python

eval_fn = FluencyEvaluator(model_config, deployment_name="gpt-4")
eval_fn = FluencyEvaluator(model_config)
result = eval_fn(
question="What is the capital of Japan?",
answer="The capital of Japan is Tokyo.")
@@ -35,15 +35,11 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
self._flow = load_flow(source=flow_dir)

# Override the connection
connection = convert_model_configuration_to_connection(model_config)
self._flow.context.connections = {
"query_llm": {
"connection": AzureOpenAIConnection(
api_base=model_config.api_base,
api_key=model_config.api_key,
api_version=model_config.api_version,
api_type="azure"
),
"deployment_name": deployment_name,
"connection": connection,
"deployment_name": model_config.azure_deployment,
}
}

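The coherence and fluency diffs share a single wiring pattern: convert the model configuration to a connection and attach it, together with the deployment name, to the flow's query_llm node. A condensed sketch of that pattern, using only calls that appear in the diffs above (the wire_flow_to_model helper itself is illustrative):

    from pathlib import Path

    from promptflow.client import load_flow
    from promptflow.core import AzureOpenAIModelConfiguration
    from promptflow.core._prompty_utils import convert_model_configuration_to_connection


    def wire_flow_to_model(flow_dir: Path, model_config: AzureOpenAIModelConfiguration):
        # Load the evaluator's flow and point its "query_llm" node at the configured deployment.
        flow = load_flow(source=flow_dir)
        connection = convert_model_configuration_to_connection(model_config)
        flow.context.connections = {
            "query_llm": {
                "connection": connection,
                "deployment_name": model_config.azure_deployment,
            }
        }
        return flow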