Add ChatEvaluator as a composite evaluator for chat scenario evaluation #2562
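For orientation, a minimal usage sketch assembled from the docstring and sample code added in this PR; the endpoint, key, and deployment name below are placeholders, and a real Azure OpenAI deployment is needed to actually produce scores.

# Minimal usage sketch mirroring the samples added in this PR.
# Endpoint, key, and deployment name are placeholders.
from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import ChatEvaluator

model_config = AzureOpenAIConnection(
    api_base="https://<your-endpoint>.openai.azure.com/",
    api_key="<your-api-key>",
    api_type="azure",
)

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {"role": "assistant", "content": "2 + 2 = 4",
     "context": {"citations": [{"id": "math_doc.md",
                                "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}},
]

# Returns aggregated metrics plus per-turn lists, e.g. "gpt_coherence" and "gpt_coherence_per_turn".
print(chat_eval(conversation=conversation))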

Closed

Changes from all commits (28 commits)
86458d7  add first built-in evaluator - groundedness (ninghu, Mar 15, 2024)
99ca676  cleanup (ninghu, Mar 15, 2024)
6bf7ec3  update setup.py (ninghu, Mar 15, 2024)
dde2cac  update (ninghu, Mar 15, 2024)
80245ed  update (ninghu, Mar 15, 2024)
266c081  add evaluator for f1 score and violence (ninghu, Mar 19, 2024)
f1b0a9e  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 19, 2024)
a4ccbbb  add more safe evaluators (ninghu, Mar 19, 2024)
3925728  add more built in evaluators (ninghu, Mar 20, 2024)
99484f1  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 20, 2024)
28793a0  move load_flow to init() (ninghu, Mar 21, 2024)
74f2171  hate fairness rename to hate unfairness (ninghu, Mar 21, 2024)
e92caa7  Change "reasoning" to "reason" (ninghu, Mar 21, 2024)
8fa07f0  add kwargs to eval_fn (ninghu, Mar 21, 2024)
5e4b693  Remove aggreation logic from all the flows (ninghu, Mar 22, 2024)
85c376d  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 22, 2024)
d54d079  update (ninghu, Mar 22, 2024)
8757275  make credential optional to support batch run scenario (ninghu, Mar 22, 2024)
aaba591  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 25, 2024)
c643f57  Convert evaluators to class based (ninghu, Mar 27, 2024)
2257ac1  update (ninghu, Mar 27, 2024)
f343890  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 27, 2024)
999ca48  Add ChatEvaluator (ninghu, Mar 29, 2024)
33ce128  update (ninghu, Mar 29, 2024)
782281d  minor update (ninghu, Mar 29, 2024)
b25753d  update (ninghu, Mar 29, 2024)
e9e2315  address the review comments (ninghu, Mar 29, 2024)
330beee  clean up (ninghu, Apr 1, 2024)
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -12,6 +12,7 @@
from .relevance import RelevanceEvaluator
from .similarity import SimilarityEvaluator
from .qa import QAEvaluator
from .chat import ChatEvaluator


__all__ = [
@@ -22,4 +23,5 @@
"RelevanceEvaluator",
"SimilarityEvaluator",
"QAEvaluator",
"ChatEvaluator",
]
205 changes: 205 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
@@ -0,0 +1,205 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import logging
import numpy as np


logger = logging.getLogger(__name__)


class ChatEvaluator:
def __init__(
self,
model_config: AzureOpenAIConnection,
deployment_name: str,
eval_last_turn: bool = False,
parallel: bool = True):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIConnection
:param deployment_name: Name of the Azure OpenAI deployment to use.
:type deployment_name: str
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False.
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution. Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function

**Usage**

.. code-block:: python

chat_eval = ChatEvaluator(model_config, deployment_name="gpt-4")
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}}
]
result = chat_eval(conversation=conversation)
"""
self._eval_last_turn = eval_last_turn
self._parallel = parallel

# TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
self._rag_evaluators = [
GroundednessEvaluator(model_config, deployment_name=deployment_name),
RelevanceEvaluator(model_config, deployment_name=deployment_name),
]
self._non_rag_evaluators = [
CoherenceEvaluator(model_config, deployment_name=deployment_name),
FluencyEvaluator(model_config, deployment_name=deployment_name),
]

def __call__(self, *, conversation: List[Dict], **kwargs):
"""Evaluates chat scenario.

:param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
The "context" key is optional for the assistant's turns; when present, it should contain a "citations" key holding a list of citations.
:type conversation: List[Dict]
:return: The scores for Chat scenario.
:rtype: dict
"""

self._validate_conversation(conversation)

# Extract questions, answers and contexts from conversation
questions = []
answers = []
contexts = []

if self._eval_last_turn:
# Process only the last two turns if _eval_last_turn is True
conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
else:
conversation_slice = conversation

for each_turn in conversation_slice:
role = each_turn["role"]
if role == "user":
questions.append(each_turn["content"])
elif role == "assistant":
answers.append(each_turn["content"])
if "context" in each_turn and "citations" in each_turn["context"]:
citations = json.dumps(each_turn["context"]["citations"])
contexts.append(citations)

# Select evaluators to be used for evaluation
compute_rag_based_metrics = True
if len(answers) != len(contexts):
safe_message = "Skipping rag based metrics as we need citations or " \
"retrieved_documents in context key of every assistant's turn"
logger.warning(safe_message)
compute_rag_based_metrics = False

selected_evaluators = []
selected_evaluators.extend(self._non_rag_evaluators)
if compute_rag_based_metrics:
selected_evaluators.extend(self._rag_evaluators)

# Evaluate each turn
per_turn_results = []
for turn_num in range(len(questions)):
current_turn_result = {}

if self._parallel:
# Parallel execution
with ThreadPoolExecutor() as executor:
future_to_evaluator = {
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator): evaluator
for evaluator in selected_evaluators
}

for future in as_completed(future_to_evaluator):
score = future.result()
current_turn_result.update(score)
else:
# Sequential execution
for evaluator in selected_evaluators:
score = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(score)

per_turn_results.append(current_turn_result)

# Aggregate results
# Final aggregated results for a conversation will look like:
# {
# "gpt_groundedness": 0.9,
# "gpt_groundedness_per_turn": [0.9, 0.8, 0.9, ...],
# ...
# }
aggregated = {}
for key in per_turn_results[0].keys():
values = [d[key] for d in per_turn_results]
aggregated[key] = np.nanmean(values)
aggregated[key + "_per_turn"] = values

return aggregated

def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
try:
question = questions[turn_num] if turn_num < len(questions) else ""
answer = answers[turn_num] if turn_num < len(answers) else ""
context = contexts[turn_num] if turn_num < len(contexts) else ""

score = evaluator(
question=question,
answer=answer,
context=context)

return score
except Exception as e:
logger.warning(f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
return {}

def _validate_conversation(self, conversation: List[Dict]):
if conversation is None or not isinstance(conversation, list):
raise ValueError("'conversation' must be a list of dictionaries.")

expected_role = "user"
for turn_num, turn in enumerate(conversation):
one_based_turn_num = turn_num + 1

if not isinstance(turn, dict):
raise ValueError(f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}")

if "role" not in turn or "content" not in turn:
raise ValueError(f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}")

if turn["role"] != expected_role:
raise ValueError(f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")

if not isinstance(turn["content"], str):
raise ValueError(f"Content in each turn must be a string. Turn number: {one_based_turn_num}")

if turn["role"] == "assistant" and "context" in turn:
if not isinstance(turn["context"], dict):
raise ValueError(f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")

if "citations" not in turn["context"]:
raise ValueError(f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}")

if not isinstance(turn["context"]["citations"], list):
raise ValueError(f"'citations' in context must be a list. Turn number: {one_based_turn_num}")

for citation_num, citation in enumerate(turn["context"]["citations"]):
if not isinstance(citation, dict):
raise ValueError(f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}")

# Toggle expected role for the next turn
expected_role = "user" if expected_role == "assistant" else "assistant"

# Ensure the conversation ends with an assistant's turn
if expected_role != "user":
raise ValueError("The conversation must end with an assistant's turn.")
src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py
@@ -8,8 +8,6 @@
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator




class QAEvaluator:
def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
"""
@@ -26,7 +24,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):

.. code-block:: python

eval_fn = qa.init(model_config, deployment_name="gpt-4")
eval_fn = QAEvaluator(model_config, deployment_name="gpt-4")
result = eval_fn(
question="Tokyo is the capital of which country?",
answer="Japan",
@@ -44,7 +42,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
]

def __call__(self, *, question: str, answer: str, context: str, ground_truth: str, **kwargs):
"""Evaluate similarity.
"""Evaluates question-answering scenario.

:param question: The question to be evaluated.
:type question: str
20 changes: 19 additions & 1 deletion src/promptflow-evals/samples/built_in_evaluators.py
@@ -1,8 +1,8 @@
import os
from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator
from promptflow.evals.evaluators import QAEvaluator
from promptflow.evals.evaluators.content_safety import ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator
from promptflow.evals.evaluators import QAEvaluator, ChatEvaluator
from azure.identity import DefaultAzureCredential


@@ -123,10 +123,28 @@ def run_qa_evaluator():
# {'gpt_groundedness': 1.0, 'gpt_relevance': 5.0, 'gpt_coherence': 5.0, 'gpt_fluency': 5.0, 'gpt_similarity': 5.0, 'f1_score': 1.0}


def run_chat_evaluator():
chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")

conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context":{"citations": [{"id": "doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}},
{"role": "user", "content": "What is the capital of Japan?"},
{"role": "assistant", "content": "The capital of Japan is Tokyo.", "context":{"citations": [{"id": "doc.md", "content": "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}]}},
]
score = chat_eval(conversation=conversation)
print(score)
# {'gpt_coherence': 5.0, 'gpt_coherence_per_turn': [5.0, 5.0], 'gpt_fluency': 5.0, 'gpt_fluency_per_turn': [5.0, 5.0], 'gpt_groundedness': 5.0, 'gpt_groundedness_per_turn': [5.0, 5.0], 'gpt_relevance': 5.0, 'gpt_relevance_per_turn': [5.0, 5.0]}
Member: Will this be in the same order as the conversation? @qusongms, how does the UI show the per-turn score right now?

Member (Author): Yes, it will be in the same order as the conversation.

Member: @ninghu, are we giving a reason for the score as well? For chat we do see reasons being presented.
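On the ordering question above: the aggregation loop in ChatEvaluator builds per_turn_results in conversation order, so each "_per_turn" list preserves turn order. A small standalone illustration of that aggregation step, with made-up scores rather than real evaluator output:

# Illustration of the aggregation step in ChatEvaluator: per-turn dicts arrive in
# conversation order, so the "_per_turn" lists preserve that order. Scores are made up.
import numpy as np

per_turn_results = [
    {"gpt_groundedness": 4.0},  # turn 1
    {"gpt_groundedness": 5.0},  # turn 2
]

aggregated = {}
for key in per_turn_results[0].keys():
    values = [d[key] for d in per_turn_results]
    aggregated[key] = np.nanmean(values)      # mean over turns, ignoring NaNs
    aggregated[key + "_per_turn"] = values    # same order as the conversation

print(aggregated)  # {'gpt_groundedness': 4.5, 'gpt_groundedness_per_turn': [4.0, 5.0]}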



if __name__ == "__main__":

# Individual evaluators
run_quality_evaluators()

run_safety_evaluators()

# Composite evaluators
run_qa_evaluator()

run_chat_evaluator()
88 changes: 88 additions & 0 deletions src/promptflow-evals/tests/unittests/test_chat_evaluator.py
@@ -0,0 +1,88 @@
import pytest
from unittest.mock import patch, Mock
from promptflow.evals.evaluators import ChatEvaluator
from promptflow.entities import AzureOpenAIConnection



class TestChatEvaluator:
def test_conversation_validation_normal(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context":{"citations": [{"id": "doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}},
{"role": "user", "content": "What is the capital of Japan?"},
{"role": "assistant", "content": "The capital of Japan is Tokyo.", "context":{"citations": [{"id": "doc.md", "content": "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}]}},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

chat_eval(conversation=conversation)

def test_conversation_validation_missing_role(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "question 1"},
{"content": "answer 1"},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: 2"

def test_conversation_validation_question_answer_not_paired(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "question 1"},
{"role": "assistant", "content": "answer 1"},
{"role": "assistant", "content": "answer 2"},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "Expected role user but got assistant. Turn number: 3"

def test_conversation_validation_invalid_citations(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "question 1"},
{"role": "assistant", "content": "answer 1", "context": {"citations": "invalid"}},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "'citations' in context must be a list. Turn number: 2"
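The test module imports patch and Mock without using them yet. A possible follow-up test, sketched here as an illustration rather than part of the PR, could stub the sub-evaluators to exercise the aggregation path without calling Azure OpenAI:

# Hypothetical follow-up test (not part of this PR): stub the sub-evaluators with
# Mock so the aggregation path runs without network calls.
from unittest.mock import Mock
from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import ChatEvaluator


def test_aggregation_with_mocked_evaluators():
    model_config = AzureOpenAIConnection(
        api_base="mocked_endpoint",
        api_key="mocked_key",
        api_type="azure",
    )

    conversation = [
        {"role": "user", "content": "question 1"},
        {"role": "assistant", "content": "answer 1"},
    ]

    chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
    # Swap the real sub-evaluators for a Mock returning a fixed score dict.
    chat_eval._non_rag_evaluators = [Mock(return_value={"gpt_coherence": 5.0})]
    chat_eval._rag_evaluators = []

    result = chat_eval(conversation=conversation)

    assert result["gpt_coherence"] == 5.0
    assert result["gpt_coherence_per_turn"] == [5.0]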