Add ChatEvaluator as a composite evaluator for chat scenario evaluation #2562
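For orientation, a minimal usage sketch assembled from the docstring and sample code added in this PR; the endpoint, key, and deployment name below are placeholders, and a real Azure OpenAI deployment is needed to actually produce scores.

# Minimal usage sketch mirroring the samples added in this PR.
# Endpoint, key, and deployment name are placeholders.
from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import ChatEvaluator

model_config = AzureOpenAIConnection(
    api_base="https://<your-endpoint>.openai.azure.com/",
    api_key="<your-api-key>",
    api_type="azure",
)

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {"role": "assistant", "content": "2 + 2 = 4",
     "context": {"citations": [{"id": "math_doc.md",
                                "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}},
]

# Returns aggregated metrics plus per-turn lists, e.g. "gpt_coherence" and "gpt_coherence_per_turn".
print(chat_eval(conversation=conversation))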

Closed

Changes from all commits (28 commits)
86458d7  add first built-in evaluator - groundedness (ninghu, Mar 15, 2024)
99ca676  cleanup (ninghu, Mar 15, 2024)
6bf7ec3  update setup.py (ninghu, Mar 15, 2024)
dde2cac  update (ninghu, Mar 15, 2024)
80245ed  update (ninghu, Mar 15, 2024)
266c081  add evaluator for f1 score and violence (ninghu, Mar 19, 2024)
f1b0a9e  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 19, 2024)
a4ccbbb  add more safe evaluators (ninghu, Mar 19, 2024)
3925728  add more built in evaluators (ninghu, Mar 20, 2024)
99484f1  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 20, 2024)
28793a0  move load_flow to init() (ninghu, Mar 21, 2024)
74f2171  hate fairness rename to hate unfairness (ninghu, Mar 21, 2024)
e92caa7  Change "reasoning" to "reason" (ninghu, Mar 21, 2024)
8fa07f0  add kwargs to eval_fn (ninghu, Mar 21, 2024)
5e4b693  Remove aggreation logic from all the flows (ninghu, Mar 22, 2024)
85c376d  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 22, 2024)
d54d079  update (ninghu, Mar 22, 2024)
8757275  make credential optional to support batch run scenario (ninghu, Mar 22, 2024)
aaba591  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 25, 2024)
c643f57  Convert evaluators to class based (ninghu, Mar 27, 2024)
2257ac1  update (ninghu, Mar 27, 2024)
f343890  Merge branch 'users/singankit/promptflow-eval' into users/ninhu/promp… (ninghu, Mar 27, 2024)
999ca48  Add ChatEvaluator (ninghu, Mar 29, 2024)
33ce128  update (ninghu, Mar 29, 2024)
782281d  minor update (ninghu, Mar 29, 2024)
b25753d  update (ninghu, Mar 29, 2024)
e9e2315  address the review comments (ninghu, Mar 29, 2024)
330beee  clean up (ninghu, Apr 1, 2024)
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -12,6 +12,7 @@
from .relevance import RelevanceEvaluator
from .similarity import SimilarityEvaluator
from .qa import QAEvaluator
from .chat import ChatEvaluator


__all__ = [
@@ -22,4 +23,5 @@
"RelevanceEvaluator",
"SimilarityEvaluator",
"QAEvaluator",
"ChatEvaluator",
]
205 changes: 205 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
@@ -0,0 +1,205 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import logging
import numpy as np


logger = logging.getLogger(__name__)


class ChatEvaluator:
def __init__(
self,
model_config: AzureOpenAIConnection,
deployment_name: str,
eval_last_turn: bool = False,
parallel: bool = True):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIConnection
:param deployment_name: Name of the Azure OpenAI deployment to use.
:type deployment_name: str
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False.
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution. Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function

**Usage**

.. code-block:: python

chat_eval = ChatEvaluator(model_config, deployment_name="gpt-4")
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}}
]
result = chat_eval(conversation=conversation)
"""
self._eval_last_turn = eval_last_turn
self._parallel = parallel

# TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
self._rag_evaluators = [
GroundednessEvaluator(model_config, deployment_name=deployment_name),
RelevanceEvaluator(model_config, deployment_name=deployment_name),
]
self._non_rag_evaluators = [
CoherenceEvaluator(model_config, deployment_name=deployment_name),
FluencyEvaluator(model_config, deployment_name=deployment_name),
]

def __call__(self, *, conversation: List[Dict], **kwargs):
"""Evaluates chat scenario.

:param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
The "context" key is optional for the assistant's turns; when present, it should contain a "citations" key holding a list of citations.
:type conversation: List[Dict]
:return: The scores for Chat scenario.
:rtype: dict
"""

self._validate_conversation(conversation)

# Extract questions, answers and contexts from conversation
questions = []
answers = []
contexts = []

if self._eval_last_turn:
# Process only the last two turns if _eval_last_turn is True
conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
else:
conversation_slice = conversation

for each_turn in conversation_slice:
role = each_turn["role"]
if role == "user":
questions.append(each_turn["content"])
elif role == "assistant":
answers.append(each_turn["content"])
if "context" in each_turn and "citations" in each_turn["context"]:
citations = json.dumps(each_turn["context"]["citations"])
contexts.append(citations)

# Select evaluators to be used for evaluation
compute_rag_based_metrics = True
if len(answers) != len(contexts):
safe_message = "Skipping rag based metrics as we need citations or " \
"retrieved_documents in context key of every assistant's turn"
logger.warning(safe_message)
compute_rag_based_metrics = False

selected_evaluators = []
selected_evaluators.extend(self._non_rag_evaluators)
if compute_rag_based_metrics:
selected_evaluators.extend(self._rag_evaluators)

# Evaluate each turn
per_turn_results = []
for turn_num in range(len(questions)):
current_turn_result = {}

if self._parallel:
# Parallel execution
with ThreadPoolExecutor() as executor:
future_to_evaluator = {
executor.submit(self._evaluate_turn, turn_num, questions, answers, contexts, evaluator): evaluator
for evaluator in selected_evaluators
}

for future in as_completed(future_to_evaluator):
score = future.result()
current_turn_result.update(score)
else:
# Sequential execution
for evaluator in selected_evaluators:
score = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(score)

per_turn_results.append(current_turn_result)

# Aggregate results
# Final aggregated results for a conversation will look like:
# {
# "gpt_groundedness": 0.9,
# "gpt_groundedness_per_turn": [0.9, 0.8, 0.9, ...],
# ...
# }
aggregated = {}
for key in per_turn_results[0].keys():
values = [d[key] for d in per_turn_results]
aggregated[key] = np.nanmean(values)
aggregated[key + "_per_turn"] = values

return aggregated

def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
try:
question = questions[turn_num] if turn_num < len(questions) else ""
answer = answers[turn_num] if turn_num < len(answers) else ""
context = contexts[turn_num] if turn_num < len(contexts) else ""

score = evaluator(
question=question,
answer=answer,
context=context)

return score
except Exception as e:
logger.warning(f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
return {}

def _validate_conversation(self, conversation: List[Dict]):
if conversation is None or not isinstance(conversation, list):
raise ValueError("'conversation' must be a list of dictionaries.")

expected_role = "user"
for turn_num, turn in enumerate(conversation):
one_based_turn_num = turn_num + 1

if not isinstance(turn, dict):
raise ValueError(f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}")

if "role" not in turn or "content" not in turn:
raise ValueError(f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}")

if turn["role"] != expected_role:
raise ValueError(f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}")

if not isinstance(turn["content"], str):
raise ValueError(f"Content in each turn must be a string. Turn number: {one_based_turn_num}")

if turn["role"] == "assistant" and "context" in turn:
if not isinstance(turn["context"], dict):
raise ValueError(f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}")

if "citations" not in turn["context"]:
raise ValueError(f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}")

if not isinstance(turn["context"]["citations"], list):
raise ValueError(f"'citations' in context must be a list. Turn number: {one_based_turn_num}")

for citation_num, citation in enumerate(turn["context"]["citations"]):
if not isinstance(citation, dict):
raise ValueError(f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}")

# Toggle expected role for the next turn
expected_role = "user" if expected_role == "assistant" else "assistant"

# Ensure the conversation ends with an assistant's turn
if expected_role != "user":
raise ValueError("The conversation must end with an assistant's turn.")
src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py
@@ -8,8 +8,6 @@
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator




class QAEvaluator:
def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
"""
@@ -26,7 +24,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):

.. code-block:: python

eval_fn = qa.init(model_config, deployment_name="gpt-4")
eval_fn = QAEvaluator(model_config, deployment_name="gpt-4")
result = eval_fn(
question="Tokyo is the capital of which country?",
answer="Japan",
@@ -44,7 +42,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str):
]

def __call__(self, *, question: str, answer: str, context: str, ground_truth: str, **kwargs):
"""Evaluate similarity.
"""Evaluates question-answering scenario.

:param question: The question to be evaluated.
:type question: str
20 changes: 19 additions & 1 deletion src/promptflow-evals/samples/built_in_evaluators.py
@@ -1,8 +1,8 @@
import os
from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator
from promptflow.evals.evaluators import QAEvaluator
from promptflow.evals.evaluators.content_safety import ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator
from promptflow.evals.evaluators import QAEvaluator, ChatEvaluator
from azure.identity import DefaultAzureCredential


@@ -123,10 +123,28 @@ def run_qa_evaluator():
# {'gpt_groundedness': 1.0, 'gpt_relevance': 5.0, 'gpt_coherence': 5.0, 'gpt_fluency': 5.0, 'gpt_similarity': 5.0, 'f1_score': 1.0}


def run_chat_evaluator():
chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")

conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context":{"citations": [{"id": "doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}},
{"role": "user", "content": "What is the capital of Japan?"},
{"role": "assistant", "content": "The capital of Japan is Tokyo.", "context":{"citations": [{"id": "doc.md", "content": "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}]}},
]
score = chat_eval(conversation=conversation)
print(score)
# {'gpt_coherence': 5.0, 'gpt_coherence_per_turn': [5.0, 5.0], 'gpt_fluency': 5.0, 'gpt_fluency_per_turn': [5.0, 5.0], 'gpt_groundedness': 5.0, 'gpt_groundedness_per_turn': [5.0, 5.0], 'gpt_relevance': 5.0, 'gpt_relevance_per_turn': [5.0, 5.0]}
Member: Will this be in the same order as the conversation? @qusongms, how does the UI show the per-turn score right now?

Member (Author): Yes, it will be in the same order as the conversation.

Member: @ninghu, are we giving a reason for the score as well? For chat we do see reasons being presented.
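On the ordering question above: the aggregation loop in ChatEvaluator builds per_turn_results in conversation order, so each "_per_turn" list preserves turn order. A small standalone illustration of that aggregation step, with made-up scores rather than real evaluator output:

# Illustration of the aggregation step in ChatEvaluator: per-turn dicts arrive in
# conversation order, so the "_per_turn" lists preserve that order. Scores are made up.
import numpy as np

per_turn_results = [
    {"gpt_groundedness": 4.0},  # turn 1
    {"gpt_groundedness": 5.0},  # turn 2
]

aggregated = {}
for key in per_turn_results[0].keys():
    values = [d[key] for d in per_turn_results]
    aggregated[key] = np.nanmean(values)      # mean over turns, ignoring NaNs
    aggregated[key + "_per_turn"] = values    # same order as the conversation

print(aggregated)  # {'gpt_groundedness': 4.5, 'gpt_groundedness_per_turn': [4.0, 5.0]}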



if __name__ == "__main__":

# Individual evaluators
run_quality_evaluators()

run_safety_evaluators()

# Composite evaluators
run_qa_evaluator()

run_chat_evaluator()
88 changes: 88 additions & 0 deletions src/promptflow-evals/tests/unittests/test_chat_evaluator.py
@@ -0,0 +1,88 @@
import pytest
from unittest.mock import patch, Mock
from promptflow.evals.evaluators import ChatEvaluator
from promptflow.entities import AzureOpenAIConnection



class TestChatEvaluator:
def test_conversation_validation_normal(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context":{"citations": [{"id": "doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]}},
{"role": "user", "content": "What is the capital of Japan?"},
{"role": "assistant", "content": "The capital of Japan is Tokyo.", "context":{"citations": [{"id": "doc.md", "content": "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}]}},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

chat_eval(conversation=conversation)

def test_conversation_validation_missing_role(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "question 1"},
{"content": "answer 1"},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: 2"

def test_conversation_validation_question_answer_not_paired(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "question 1"},
{"role": "assistant", "content": "answer 1"},
{"role": "assistant", "content": "answer 2"},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "Expected role user but got assistant. Turn number: 3"

def test_conversation_validation_invalid_citations(self):
model_config = AzureOpenAIConnection(
api_base="mocked_endpoint",
api_key="mocked_key",
api_type="azure",
)

conversation = [
{"role": "user", "content": "question 1"},
{"role": "assistant", "content": "answer 1", "context": {"citations": "invalid"}},
]

chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
chat_eval._non_rag_evaluators = []
chat_eval._rag_evaluators = []

with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "'citations' in context must be a list. Turn number: 2"
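The test module imports patch and Mock without using them yet. A possible follow-up test, sketched here as an illustration rather than part of the PR, could stub the sub-evaluators to exercise the aggregation path without calling Azure OpenAI:

# Hypothetical follow-up test (not part of this PR): stub the sub-evaluators with
# Mock so the aggregation path runs without network calls.
from unittest.mock import Mock
from promptflow.entities import AzureOpenAIConnection
from promptflow.evals.evaluators import ChatEvaluator


def test_aggregation_with_mocked_evaluators():
    model_config = AzureOpenAIConnection(
        api_base="mocked_endpoint",
        api_key="mocked_key",
        api_type="azure",
    )

    conversation = [
        {"role": "user", "content": "question 1"},
        {"role": "assistant", "content": "answer 1"},
    ]

    chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")
    # Swap the real sub-evaluators for a Mock returning a fixed score dict.
    chat_eval._non_rag_evaluators = [Mock(return_value={"gpt_coherence": 5.0})]
    chat_eval._rag_evaluators = []

    result = chat_eval(conversation=conversation)

    assert result["gpt_coherence"] == 5.0
    assert result["gpt_coherence_per_turn"] == [5.0]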