
Commit

Merge branch 'main' into patch-2
hyoshioka0128 authored Sep 19, 2024
2 parents c82b1c9 + 3cde352 commit f3d998f
Showing 47 changed files with 1,232,352 additions and 146 deletions.
105 changes: 57 additions & 48 deletions .cspell.json
@@ -54,65 +54,67 @@
"benchmark/promptflow-serve/result-archive/**"
],
"words": [
"aoai",
"amlignore",
"mldesigner",
"faiss",
"serp",
"aoai",
"Apim",
"astext",
"attribited",
"azureai",
"azurecr",
"azureml",
"mlflow",
"vnet",
"openai",
"pfazure",
"azuremlsdktestpypi",
"Bhavik",
"centralus",
"chatml",
"cref",
"e2etest",
"e2etests",
"eastus",
"azureai",
"vectordb",
"Qdrant",
"Weaviate",
"Entra",
"env",
"e2etests",
"e2etest",
"tablefmt",
"logprobs",
"logit",
"faiss",
"geval",
"hnsw",
"chatml",
"UNLCK",
"junit",
"KHTML",
"Likert",
"llmlingua",
"logit",
"logprobs",
"meid",
"mgmt",
"MistralAI",
"mldesigner",
"mlflow",
"msal",
"msrest",
"myconn",
"numlines",
"azurecr",
"centralus",
"nunit",
"openai",
"pfazure",
"pfbytes",
"pfcli",
"pfutil",
"Policheck",
"azuremlsdktestpypi",
"rediraffe",
"pydata",
"ROBOCOPY",
"undoc",
"Qdrant",
"rediraffe",
"retriable",
"pfcli",
"pfutil",
"mgmt",
"wsid",
"westus",
"msrest",
"cref",
"msal",
"pfbytes",
"Apim",
"junit",
"nunit",
"astext",
"Likert",
"geval",
"ROBOCOPY",
"serp",
"Summ",
"Bhavik",
"meid",
"Entra",
"tablefmt",
"undoc",
"UNLCK",
"upia",
"uvicorn",
"attribited",
"MistralAI",
"llmlingua",
"myconn"
"vectordb",
"vnet",
"Weaviate",
"westus",
"wsid",
"Xpia"
],
"ignoreWords": [
"openmpi",
@@ -242,13 +244,20 @@
"azureopenaimodelconfiguration",
"openaimodelconfiguration",
"usecwd",
"upia",
"xpia",
"locustio",
"euap",
"Rerank",
"rerank",
"reranker",
"rcfile",
"pylintrc"
"pylintrc",
"gleu",
"Gleu",
"GLEU",
"fmeasure",
"punkt"
],
"flagWords": [
"Prompt Flow"
6 changes: 0 additions & 6 deletions docs/cloud/index.md
@@ -27,12 +27,6 @@ In prompt flow, You can develop your flow locally and then seamlessly transition

For more resources on Azure AI, visit the cloud documentation site: [Build AI solutions with prompt flow](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/get-started-prompt-flow?view=azureml-api-2).

```{toctree}
:caption: Tracing
:maxdepth: 2
azureai/tracing/index
```

```{toctree}
:caption: Flow
:maxdepth: 2
4 changes: 0 additions & 4 deletions docs/reference/pf-command-reference.md
@@ -1,9 +1,5 @@
# pf

:::{admonition} Experimental feature
This is an experimental feature, and may change at any time. Learn [more](../how-to-guides/faq.md#stable-vs-experimental).
:::

Manage prompt flow resources with the prompt flow CLI.

| Command | Description |
10 changes: 9 additions & 1 deletion src/promptflow-evals/CHANGELOG.md
@@ -2,13 +2,21 @@

## v0.3.3 (Upcoming)
### Features Added
- Add a new evaluator (ProtectedMaterialsEvaluator) and associated adversarial content simulator enum type (AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL) for protected materials, which determines if given inputs contain materials protected by IP laws.
- Introduced `IndirectAttackSimulator` to simulate XPIA (cross-domain prompt injected attack) jailbreak attacks on your AI system.
- Introduced `IndirectAttackEvaluator` to evaluate conversation or Q/A content for XPIA (cross-domain prompt injected attacks) that disrupt expected functionality by eliciting manipulated content, intrusion, or information gathering outside the scope of your AI system.
- Add a new evaluator (ProtectedMaterialEvaluator) and associated adversarial content simulator enum type (AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL) for protected material, which determines if given inputs contain material protected by IP laws.
- Added four mathematical evaluators, `BleuScoreEvaluator`, `GleuScoreEvaluator`, `MeteorScoreEvaluator` and `RougeScoreEvaluator`, for evaluating the quality of generated text by comparing it against reference text.
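
A minimal, hedged usage sketch of one of the new math evaluators follows. The import path, the no-argument constructor, the keyword names (`answer`, `ground_truth`) and the shape of the returned dict are assumptions drawn from the general evaluator pattern, not confirmed signatures.

```python
# Illustrative sketch only -- names marked below are assumptions, not confirmed API.
from promptflow.evals.evaluators import BleuScoreEvaluator  # assumed import path

bleu = BleuScoreEvaluator()

# Compare a generated answer against a reference text; the keyword names
# (answer, ground_truth) are assumed from the common evaluator calling pattern.
result = bleu(
    answer="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result)  # expected: a dict containing the BLEU score, e.g. {"bleu_score": 0.42}
```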

### Bugs Fixed
- Fixed evaluators to accept (non-Azure) OpenAI configs.

### Breaking Changes
- Replaced `jailbreak` parameter in `AdversarialSimulator` with `_jailbreak_type` parameter to support multiple jailbreak types. Instead of editing this parameter directly, we recommend using the `JailbreakAdversarialSimulator` class for UPIA jailbreak and `IndirectAttackSimulator` class for XPIA jailbreak.

### Improvements
- Renamed `JailbreakAdversarialSimulator` to `DirectAttackSimulator`.
- Set the PF_EVALS_BATCH_USE_ASYNC environment variable to True by default to enable asynchronous batch run for async-enabled built-in evaluators, improving performance.
- The `AdversarialSimulator` class now supports randomization of simulation prompts, as well as seeding of said randomization for consistency via two new arguments: `randomize_order` and `randomization_seed`. Randomization is enabled by default. The child class `DirectAttackSimulator` always uses a `randomization_seed` to synchronize sub-simulators, even if none is provided.
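
To illustrate the renamed simulator and the new randomization arguments, here is a hedged sketch. Only `randomize_order` and `randomization_seed` are named by this changelog; the import path, constructor arguments, scenario member, and callback shape are assumptions. Under the same assumptions, `DirectAttackSimulator` (formerly `JailbreakAdversarialSimulator`) and `IndirectAttackSimulator` would follow the same calling pattern recommended in the breaking change above.

```python
# Illustrative sketch only: constructor, scenario and callback details are assumptions;
# randomize_order / randomization_seed are the two new arguments described above.
import asyncio

from promptflow.evals.synthetic import AdversarialScenario, AdversarialSimulator  # assumed path


async def target(messages, stream=False, session_state=None, context=None):
    # Hypothetical callback that would forward the adversarial prompt to the AI
    # system under test and return its reply in the simulator's message format.
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}


async def main():
    azure_ai_project = {  # placeholder values
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }
    simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=None)
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,  # assumed scenario member
        target=target,
        max_simulation_results=5,
        randomize_order=True,   # new: shuffle the simulation prompts (enabled by default)
        randomization_seed=42,  # new: seed the shuffle for reproducible runs
    )
    print(len(outputs))


if __name__ == "__main__":
    asyncio.run(main())
```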

## v0.3.2 (2024-08-13)
### Features Added
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/constants.py
@@ -32,6 +32,7 @@ class Tasks:

CONTENT_HARM = "content harm"
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"


class _InternalAnnotationTasks:
@@ -52,6 +53,7 @@ class EvaluationMetrics:
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"


class _InternalEvaluationMetrics:
26 changes: 23 additions & 3 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -103,6 +103,9 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
elif metric == _InternalEvaluationMetrics.ECI:
task = _InternalAnnotationTasks.ECI
include_metric = False
elif metric == EvaluationMetrics.XPIA:
task = Tasks.XPIA
include_metric = False
return (
{
"UserTextList": [normalized_user_text],
@@ -208,16 +211,33 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
:rtype: List[List[Dict]]
"""

if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
# non-numeric metrics
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
response = response.replace("false", "False")
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
# Use label instead of score since these are assumed to be boolean results.
# Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

if metric_name == EvaluationMetrics.XPIA:
# Add "manipulated_content", "intrusion" and "information_gathering" to the result
# if present else set them to np.nan
result[metric_name + "_manipulated_content"] = (
parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
)
result[metric_name + "_intrusion"] = (
parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
)
result[metric_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
)

return result
return _parse_content_harm_response(batch_response, metric_name)
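
For clarity, a parsed XPIA result built by the branch above would take roughly the following shape when every field is present in the service response (a sketch; the values are illustrative):

```python
# Sketch of the dict shape built above for metric_name == "xpia"; missing fields
# fall back to np.nan (label and sub-fields) or "" (reason).
xpia_result = {
    "xpia_label": False,
    "xpia_reason": "No injected instructions detected.",
    "xpia_manipulated_content": False,
    "xpia_intrusion": False,
    "xpia_information_gathering": False,
}
```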

28 changes: 28 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/utils.py
@@ -5,8 +5,20 @@
from . import constants
except ImportError:
import constants

from typing import List, cast

import nltk
import numpy as np

try:
from nltk.tokenize.nist import NISTTokenizer
except LookupError:
nltk.download("perluniprops")
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize.nist import NISTTokenizer


def get_harm_severity_level(harm_score: int) -> str:
"""Generate harm severity level based on harm score.
@@ -28,3 +40,19 @@ def get_harm_severity_level(harm_score: int) -> str:
if harm_score_range[0] <= harm_score <= harm_score_range[1]:
return harm_level.value
return np.nan


def nltk_tokenize(text: str) -> List[str]:
"""Tokenize the input text using the NLTK tokenizer."""

is_latin_or_numeric = all(
("\u0020" <= c <= "\u007E") # Basic Latin
or ("\u00A0" <= c <= "\u00FF") # Latin-1 Supplement
or ("0" <= c <= "9") # Digits
for c in text
)

if is_latin_or_numeric:
return cast(List[str], nltk.word_tokenize(text))

return list(NISTTokenizer().international_tokenize(text))
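
A quick usage sketch of the helper above (the example strings are illustrative): text within the Basic Latin and Latin-1 ranges goes through `nltk.word_tokenize`, while anything else falls back to the NIST international tokenizer.

```python
# Illustrative usage of nltk_tokenize as defined above.
print(nltk_tokenize("Prompt flow makes evaluation easy."))
# -> word_tokenize path: ['Prompt', 'flow', 'makes', 'evaluation', 'easy', '.']

print(nltk_tokenize("こんにちは世界"))
# -> NISTTokenizer international path; token boundaries depend on the NIST rules
```
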
11 changes: 11 additions & 0 deletions src/promptflow-evals/promptflow/evals/_constants.py
@@ -18,6 +18,17 @@ class EvaluationMetrics:
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"


class _InternalEvaluationMetrics:
"""Evaluation metrics that are not publicly supported.
These metrics are experimental and subject to potential change or migration to the main
enum over time.
"""

ECI = "eci"


class Prefixes: