
Commit

Merge branch 'main' into patch-2
hyoshioka0128 authored Sep 19, 2024
2 parents c82b1c9 + 3cde352 commit f3d998f
Showing 47 changed files with 1,232,352 additions and 146 deletions.
105 changes: 57 additions & 48 deletions .cspell.json
@@ -54,65 +54,67 @@
"benchmark/promptflow-serve/result-archive/**"
],
"words": [
"aoai",
"amlignore",
"mldesigner",
"faiss",
"serp",
"aoai",
"Apim",
"astext",
"attribited",
"azureai",
"azurecr",
"azureml",
"mlflow",
"vnet",
"openai",
"pfazure",
"azuremlsdktestpypi",
"Bhavik",
"centralus",
"chatml",
"cref",
"e2etest",
"e2etests",
"eastus",
"azureai",
"vectordb",
"Qdrant",
"Weaviate",
"Entra",
"env",
"e2etests",
"e2etest",
"tablefmt",
"logprobs",
"logit",
"faiss",
"geval",
"hnsw",
"chatml",
"UNLCK",
"junit",
"KHTML",
"Likert",
"llmlingua",
"logit",
"logprobs",
"meid",
"mgmt",
"MistralAI",
"mldesigner",
"mlflow",
"msal",
"msrest",
"myconn",
"numlines",
"azurecr",
"centralus",
"nunit",
"openai",
"pfazure",
"pfbytes",
"pfcli",
"pfutil",
"Policheck",
"azuremlsdktestpypi",
"rediraffe",
"pydata",
"ROBOCOPY",
"undoc",
"Qdrant",
"rediraffe",
"retriable",
"pfcli",
"pfutil",
"mgmt",
"wsid",
"westus",
"msrest",
"cref",
"msal",
"pfbytes",
"Apim",
"junit",
"nunit",
"astext",
"Likert",
"geval",
"ROBOCOPY",
"serp",
"Summ",
"Bhavik",
"meid",
"Entra",
"tablefmt",
"undoc",
"UNLCK",
"upia",
"uvicorn",
"attribited",
"MistralAI",
"llmlingua",
"myconn"
"vectordb",
"vnet",
"Weaviate",
"westus",
"wsid",
"Xpia"
],
"ignoreWords": [
"openmpi",
@@ -242,13 +244,20 @@
"azureopenaimodelconfiguration",
"openaimodelconfiguration",
"usecwd",
"upia",
"xpia",
"locustio",
"euap",
"Rerank",
"rerank",
"reranker",
"rcfile",
"pylintrc"
"pylintrc",
"gleu",
"Gleu",
"GLEU",
"fmeasure",
"punkt"
],
"flagWords": [
"Prompt Flow"
6 changes: 0 additions & 6 deletions docs/cloud/index.md
@@ -27,12 +27,6 @@ In prompt flow, You can develop your flow locally and then seamlessly transition

For more resources on Azure AI, visit the cloud documentation site: [Build AI solutions with prompt flow](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/get-started-prompt-flow?view=azureml-api-2).

```{toctree}
:caption: Tracing
:maxdepth: 2
azureai/tracing/index
```

```{toctree}
:caption: Flow
:maxdepth: 2
4 changes: 0 additions & 4 deletions docs/reference/pf-command-reference.md
@@ -1,9 +1,5 @@
# pf

:::{admonition} Experimental feature
This is an experimental feature, and may change at any time. Learn [more](../how-to-guides/faq.md#stable-vs-experimental).
:::

Manage prompt flow resources with the prompt flow CLI.

| Command | Description |
10 changes: 9 additions & 1 deletion src/promptflow-evals/CHANGELOG.md
@@ -2,13 +2,21 @@

## v0.3.3 (Upcoming)
### Features Added
- Add a new evaluator (ProtectedMaterialsEvaluator) and associated adversarial content simulator enum type (AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL) for protected materials, which determines if given inputs contain materials protected by IP laws.
- Introduced `IndirectAttackSimulator` to simulate XPIA (cross-domain prompt injected attack) jailbreak attacks on your AI system.
- Introduced `IndirectAttackEvaluator` to evaluate conversation or Q/A content for XPIA (cross-domain prompt injected attacks) that disrupt expected functionality by eliciting manipulated content, intrusion, or information gathering outside the scope of your AI system.
- Add a new evaluator (ProtectedMaterialEvaluator) and associated adversarial content simulator enum type (AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL) for protected material, which determines if given inputs contain material protected by IP laws.
- Added four mathematical evaluators, `BleuScoreEvaluator`, `GleuScoreEvaluator`, `MeteorScoreEvaluator` and `RougeScoreEvaluator`, for evaluating the quality of generated text by comparing it against reference text.
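
A minimal, hedged usage sketch of one of the new math evaluators follows. The import path, the no-argument constructor, the keyword names (`answer`, `ground_truth`) and the shape of the returned dict are assumptions drawn from the general evaluator pattern, not confirmed signatures.

```python
# Illustrative sketch only -- names marked below are assumptions, not confirmed API.
from promptflow.evals.evaluators import BleuScoreEvaluator  # assumed import path

bleu = BleuScoreEvaluator()

# Compare a generated answer against a reference text; the keyword names
# (answer, ground_truth) are assumed from the common evaluator calling pattern.
result = bleu(
    answer="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result)  # expected: a dict containing the BLEU score, e.g. {"bleu_score": 0.42}
```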

### Bugs Fixed
- Fixed evaluators to accept (non-Azure) OpenAI configs.

### Breaking Changes
- Replaced `jailbreak` parameter in `AdversarialSimulator` with `_jailbreak_type` parameter to support multiple jailbreak types. Instead of editing this parameter directly, we recommend using the `JailbreakAdversarialSimulator` class for UPIA jailbreak and `IndirectAttackSimulator` class for XPIA jailbreak.

### Improvements
- Renamed `JailbreakAdversarialSimulator` to `DirectAttackSimulator`.
- Set the PF_EVALS_BATCH_USE_ASYNC environment variable to True by default to enable asynchronous batch run for async-enabled built-in evaluators, improving performance.
- The `AdversarialSimulator` class now supports randomization of simulation prompts, as well as seeding of said randomization for consistency via two new arguments: `randomize_order` and `randomization_seed`. Randomization is enabled by default. The child class `DirectAttackSimulator` always uses a `randomization_seed` to synchronize sub-simulators, even if none is provided.
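
To illustrate the renamed simulator and the new randomization arguments, here is a hedged sketch. Only `randomize_order` and `randomization_seed` are named by this changelog; the import path, constructor arguments, scenario member, and callback shape are assumptions. Under the same assumptions, `DirectAttackSimulator` (formerly `JailbreakAdversarialSimulator`) and `IndirectAttackSimulator` would follow the same calling pattern recommended in the breaking change above.

```python
# Illustrative sketch only: constructor, scenario and callback details are assumptions;
# randomize_order / randomization_seed are the two new arguments described above.
import asyncio

from promptflow.evals.synthetic import AdversarialScenario, AdversarialSimulator  # assumed path


async def target(messages, stream=False, session_state=None, context=None):
    # Hypothetical callback that would forward the adversarial prompt to the AI
    # system under test and return its reply in the simulator's message format.
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}


async def main():
    azure_ai_project = {  # placeholder values
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }
    simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=None)
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,  # assumed scenario member
        target=target,
        max_simulation_results=5,
        randomize_order=True,   # new: shuffle the simulation prompts (enabled by default)
        randomization_seed=42,  # new: seed the shuffle for reproducible runs
    )
    print(len(outputs))


if __name__ == "__main__":
    asyncio.run(main())
```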

## v0.3.2 (2024-08-13)
### Features Added
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/constants.py
@@ -32,6 +32,7 @@ class Tasks:

CONTENT_HARM = "content harm"
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"


class _InternalAnnotationTasks:
@@ -52,6 +53,7 @@ class EvaluationMetrics:
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"


class _InternalEvaluationMetrics:
26 changes: 23 additions & 3 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -103,6 +103,9 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
elif metric == _InternalEvaluationMetrics.ECI:
task = _InternalAnnotationTasks.ECI
include_metric = False
elif metric == EvaluationMetrics.XPIA:
task = Tasks.XPIA
include_metric = False
return (
{
"UserTextList": [normalized_user_text],
@@ -208,16 +211,33 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
:rtype: List[List[Dict]]
"""

if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
# non-numeric metrics
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
response = response.replace("false", "False")
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
# Use label instead of score since these are assumed to be boolean results.
# Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

if metric_name == EvaluationMetrics.XPIA:
# Add "manipulated_content", "intrusion" and "information_gathering" to the result
# if present else set them to np.nan
result[metric_name + "_manipulated_content"] = (
parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
)
result[metric_name + "_intrusion"] = (
parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
)
result[metric_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
)

return result
return _parse_content_harm_response(batch_response, metric_name)
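
For clarity, a parsed XPIA result built by the branch above would take roughly the following shape when every field is present in the service response (a sketch; the values are illustrative):

```python
# Sketch of the dict shape built above for metric_name == "xpia"; missing fields
# fall back to np.nan (label and sub-fields) or "" (reason).
xpia_result = {
    "xpia_label": False,
    "xpia_reason": "No injected instructions detected.",
    "xpia_manipulated_content": False,
    "xpia_intrusion": False,
    "xpia_information_gathering": False,
}
```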

28 changes: 28 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/utils.py
@@ -5,8 +5,20 @@
from . import constants
except ImportError:
import constants

from typing import List, cast

import nltk
import numpy as np

try:
from nltk.tokenize.nist import NISTTokenizer
except LookupError:
nltk.download("perluniprops")
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize.nist import NISTTokenizer


def get_harm_severity_level(harm_score: int) -> str:
"""Generate harm severity level based on harm score.
@@ -28,3 +40,19 @@ def get_harm_severity_level(harm_score: int) -> str:
if harm_score_range[0] <= harm_score <= harm_score_range[1]:
return harm_level.value
return np.nan


def nltk_tokenize(text: str) -> List[str]:
"""Tokenize the input text using the NLTK tokenizer."""

is_latin_or_numeric = all(
("\u0020" <= c <= "\u007E") # Basic Latin
or ("\u00A0" <= c <= "\u00FF") # Latin-1 Supplement
or ("0" <= c <= "9") # Digits
for c in text
)

if is_latin_or_numeric:
return cast(List[str], nltk.word_tokenize(text))

return list(NISTTokenizer().international_tokenize(text))
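
A quick usage sketch of the helper above (the example strings are illustrative): text within the Basic Latin and Latin-1 ranges goes through `nltk.word_tokenize`, while anything else falls back to the NIST international tokenizer.

```python
# Illustrative usage of nltk_tokenize as defined above.
print(nltk_tokenize("Prompt flow makes evaluation easy."))
# -> word_tokenize path: ['Prompt', 'flow', 'makes', 'evaluation', 'easy', '.']

print(nltk_tokenize("こんにちは世界"))
# -> NISTTokenizer international path; token boundaries depend on the NIST rules
```
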
11 changes: 11 additions & 0 deletions src/promptflow-evals/promptflow/evals/_constants.py
@@ -18,6 +18,17 @@ class EvaluationMetrics:
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"


class _InternalEvaluationMetrics:
"""Evaluation metrics that are not publicly supported.
These metrics are experimental and subject to potential change or migration to the main
enum over time.
"""

ECI = "eci"


class Prefixes: