Add code example
nick863 committed Apr 19, 2024
1 parent feb6a12 commit a801282
Showing 1 changed file with 111 additions and 17 deletions: src/promptflow-evals/README.md
@@ -4,20 +4,114 @@
[![License: MIT](https://img.shields.io/github/license/microsoft/promptflow)](https://github.com/microsoft/promptflow/blob/main/LICENSE)

## Introduction
Evaluators are prebuilt promptflow pipelines designed to measure the quality of the outputs from large language models.
The package includes the following evaluators (a usage sketch follows the list):
- F1 score evaluator. Computes the F1 score based on the actual and predicted answers.
- Chat evaluator. An ensemble of other evaluators. It accepts a list of dialog turns, each containing a question, an answer and optionally a context, and applies the evaluators to them. It calculates the coherence and fluency of the answers, and if every data point has a context, the RAG-based metrics are also calculated by the groundedness and relevance evaluators.
- Coherence evaluator. Calculates the coherence of an answer, measured by how well all the sentences fit together and sound natural as a whole.
- Fluency evaluator. Measures the quality of the individual sentences in the answer, and whether they are well written and grammatically correct.
- Groundedness evaluator. Applied when a context is provided. It returns an integer score between 1 and 5 measuring how logically the answer follows from the context, where 1 means the answer is a false statement and 5 means it is a true statement.
- QA evaluator. An ensemble of evaluators that calculates the groundedness, relevance, coherence, fluency, similarity and F1 score of a question and answer pair.
- Relevance evaluator. Measures how well the answer addresses the main aspects of the question, based on the context. It also returns an integer value from 1 to 5, where 1 means the answer completely lacks relevance and 5 means the relevance is perfect.
- Similarity evaluator. Measures the similarity between the predicted answer and the correct answer on a scale from 1 to 5, where 1 means no similarity and 5 means absolute similarity.
- Content safety evaluators. Score the answer received from the model based on the presence of inappropriate content. The package includes the following content safety evaluators:
  * Violence evaluator
  * Self-harm evaluator
  * Hate/unfairness evaluator
  * Sexual evaluator
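
For orientation, here is a minimal sketch of how two of the evaluators listed above might be called. It assumes that `F1ScoreEvaluator` and `CoherenceEvaluator`, exported from `promptflow.evals.evaluators`, follow the same construction and call pattern as the `RelevanceEvaluator` shown in the Usage section below; the returned keys in the comments are assumptions rather than verified output.

```python
import os

from promptflow.core import AzureOpenAIModelConfiguration
# Assumed to follow the same pattern as RelevanceEvaluator in the Usage section below.
from promptflow.evals.evaluators import CoherenceEvaluator, F1ScoreEvaluator

# F1 score is a purely textual comparison, so no model configuration is needed.
f1_eval = F1ScoreEvaluator()
f1_result = f1_eval(
    answer="The Alpine Explorer Tent is the most waterproof.",
    ground_truth="The Alpine Explorer Tent has the highest waterproof rating.",
)
print(f1_result)  # e.g. {'f1_score': ...}

# Coherence is an AI-assisted metric, so it needs an Azure OpenAI deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_KEY"),
    azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
)
coherence_eval = CoherenceEvaluator(model_config)
coherence_result = coherence_eval(
    question="Which tent is the most waterproof?",
    answer="The Alpine Explorer Tent is the most waterproof.",
)
print(coherence_result)  # e.g. {'gpt_coherence': ...}
```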
Evaluators are prebuilt promptflow pipelines that are designed to measure the quality of the outputs from language models.

## Usage
Users can create and run evaluators on a local machine as shown in the example below:

```python
import os
from pprint import pprint

from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator
from promptflow.evals.evaluators.content_safety import ViolenceEvaluator


def built_in_evaluator():
    # Initialize Azure OpenAI Model Configuration
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_key=os.environ.get("AZURE_OPENAI_KEY"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    )

    # Initializing Relevance Evaluator
    relevance_eval = RelevanceEvaluator(model_config)

    # Running Relevance Evaluator on single input row
    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list, the alpine explorer tent is the most waterproof. The Adventure Dining "
        "Table has higher weight.",
    )

    pprint(relevance_score)


def content_safety_evaluator():
    # Initialize Project Scope
    project_scope = {
        "subscription_id": "e0fd569c-e34a-4249-8c24-e8d723c7f054",
        "resource_group_name": "rg-test",
        "project_name": "project-test",
    }

    violence_eval = ViolenceEvaluator(project_scope)
    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
    pprint(violence_score)


def answer_length(answer, **kwargs):
    # Simple code-based evaluator: returns the length of the answer string.
    return {"value": len(answer)}


if __name__ == "__main__":
    # Built-in evaluators
    # Initialize Azure OpenAI Connection
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_key=os.environ.get("AZURE_OPENAI_KEY"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    )

    # Initializing Relevance Evaluator
    relevance_eval = RelevanceEvaluator(model_config)

    # Running Relevance Evaluator on single input row
    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        question="Which tent is the most waterproof?",
    )

    pprint(relevance_score)
    # {'gpt_relevance': 5.0}

    # Content Safety Evaluator

    # Initialize Project Scope
    project_scope = {
        "subscription_id": "e0fd569c-e34a-4249-8c24-e8d723c7f054",
        "resource_group_name": "rg-test",
        "project_name": "project-test",
    }

    violence_eval = ViolenceEvaluator(project_scope)
    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
    pprint(violence_score)
    # {'violence': 'Safe',
    #  'violence_reason': "The system's response is a straightforward factual answer "
    #                     'to a geography question. There is no violent content or '
    #                     'language present.',
    #  'violence_score': 0}

    # Code-based evaluator
    answer_length("The Alpine Explorer Tent is the most waterproof.")
    # {'value': 48}

    # Using multiple evaluators together using `Evaluate` API

    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={
            "answer_length": answer_length,
            "violence": violence_eval,
        },
    )

    pprint(result)
```
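
The `evaluate` call above reads its input rows from `evaluate_test_data.jsonl`, one JSON object per line. The snippet below is a minimal sketch of how such a file might be created; the field names (`question`, `answer`, `context`) are assumptions chosen to line up with the inputs the evaluators in this example expect, not a required schema.

```python
import json

# Hypothetical rows matching the evaluator inputs used above; adjust the fields to your own data.
rows = [
    {
        "question": "Which tent is the most waterproof?",
        "answer": "The Alpine Explorer Tent is the most waterproof.",
        "context": "From our product list, the Alpine Explorer Tent is the most waterproof.",
    },
    {
        "question": "What is the capital of France?",
        "answer": "Paris.",
        "context": "France's capital city is Paris.",
    },
]

# Each line of the JSONL file is one JSON object, i.e. one row to evaluate.
with open("evaluate_test_data.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```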
