-
Notifications
You must be signed in to change notification settings - Fork 901
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/xpia sim and eval fixes (#3723)
A fork of the original XPIA sim/eval branch with additional fixes for bugs discovered last night. - changes the jailbreak check for combining templates to only account for upia (since xpia doesn't merge templates) - removes conversations as an input for xpia evals until the default override bug is fixed. - Accounts for new xpia evaluator return fields. - Changes the output base name of 'reasoning' fields for label-based evaluators to just 'reason' Original PR: #3703 --------- Co-authored-by: Diondra Peck <[email protected]> Co-authored-by: Diondra <[email protected]>
- Loading branch information
1 parent
b7dc8b7
commit b04e889
Showing
20 changed files
with
136,456 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 5 additions & 0 deletions
5
src/promptflow-evals/promptflow/evals/evaluators/_xpia/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from ._xpia import IndirectAttackEvaluator | ||
|
||
__all__ = [ | ||
"IndirectAttackEvaluator", | ||
] |
133 changes: 133 additions & 0 deletions
133
src/promptflow-evals/promptflow/evals/evaluators/_xpia/_xpia.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
# --------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# --------------------------------------------------------- | ||
import logging | ||
from typing import Optional | ||
|
||
from promptflow._utils.async_utils import async_run_allowing_running_loop | ||
from promptflow.evals._common.constants import EvaluationMetrics | ||
from promptflow.evals._common.rai_service import evaluate_with_rai_service | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class IndirectAttackEvaluator: | ||
""" | ||
Initializes an XPIA (cross domain prompt injected attack) jailbreak evaluator to detect whether cross domain | ||
injected attacks are present in your AI system's response. | ||
:param project_scope: The scope of the Azure AI project. | ||
It contains subscription id, resource group, and project name. | ||
:type project_scope: dict | ||
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, | ||
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False | ||
:type eval_last_turn: bool | ||
:param credential: The credential for connecting to Azure AI project. | ||
:type credential: ~azure.core.credentials.TokenCredential | ||
:return: A function that evaluates and generates metrics for XPIA chat scenario. Metrics include the overall | ||
evaluation label and reason for the Q/A Pair, as well as sub-labels | ||
for manipulated content, intrusion, and information. | ||
:rtype: Callable | ||
**Usage** | ||
.. code-block:: python | ||
eval_fn = IndirectAttackEvaluator(model_config) | ||
result = eval_fn(question="What is the capital of France?", answer="Paris.") | ||
**Output format for question-answer pair** | ||
.. code-block:: python | ||
{ | ||
'xpia_label': False, | ||
'xpia_reason': 'The conversation does not contain any manipulated content, intrusion or | ||
information gathering.' | ||
'xpia_information_gathering': False, | ||
'xpia_intrusion': False | ||
'xpia_manipulated_content': False | ||
} | ||
""" | ||
|
||
def __init__(self, project_scope: dict, eval_last_turn: bool = False, credential=None): | ||
self._evaluator = _IndirectAttackEvaluator(project_scope, credential) | ||
self._eval_last_turn = eval_last_turn | ||
|
||
def __call__( | ||
self, | ||
*, | ||
question: Optional[str], | ||
answer: Optional[str], | ||
**kwargs, | ||
): | ||
""" | ||
Evaluates content according to the presence of attacks injected into the conversation context to | ||
interrupt normal expected functionality by eliciting manipulated content, intrusion and attempting | ||
to gather information outside the scope of your AI system. | ||
:keyword question: The question to be evaluated. Mutually exclusive with 'conversation'. | ||
:paramtype question: Optional[str] | ||
:keyword answer: The answer to be evaluated. Mutually exclusive with 'conversation'. | ||
:paramtype answer: Optional[str] | ||
:return: The evaluation scores and reasoning. | ||
:rtype: dict | ||
""" | ||
|
||
return self._evaluator(question=question, answer=answer, **kwargs) | ||
|
||
|
||
class _AsyncIndirectAttackEvaluator: | ||
def __init__(self, project_scope: dict, credential=None): | ||
self._project_scope = project_scope | ||
self._credential = credential | ||
|
||
async def __call__(self, *, question: str, answer: str, **kwargs): | ||
""" | ||
Evaluates content according to this evaluator's metric. | ||
:keyword question: The question to be evaluated. | ||
:paramtype question: str | ||
:keyword answer: The answer to be evaluated. | ||
:paramtype answer: str | ||
:return: The evaluation score computation based on the metric (self.metric). | ||
:rtype: Any | ||
""" | ||
# Validate inputs | ||
# Raises value error if failed, so execution alone signifies success. | ||
if not (question and question.strip() and question != "None") or not ( | ||
answer and answer.strip() and answer != "None" | ||
): | ||
raise ValueError("Both 'question' and 'answer' must be non-empty strings.") | ||
|
||
# Run score computation based on supplied metric. | ||
result = await evaluate_with_rai_service( | ||
metric_name=EvaluationMetrics.XPIA, | ||
question=question, | ||
answer=answer, | ||
project_scope=self._project_scope, | ||
credential=self._credential, | ||
) | ||
return result | ||
|
||
|
||
class _IndirectAttackEvaluator: | ||
def __init__(self, project_scope: dict, credential=None): | ||
self._async_evaluator = _AsyncIndirectAttackEvaluator(project_scope, credential) | ||
|
||
def __call__(self, *, question: str, answer: str, **kwargs): | ||
""" | ||
Evaluates XPIA content. | ||
:keyword question: The question to be evaluated. | ||
:paramtype question: str | ||
:keyword answer: The answer to be evaluated. | ||
:paramtype answer: str | ||
:keyword context: The context to be evaluated. | ||
:paramtype context: str | ||
:return: The XPIA score. | ||
:rtype: dict | ||
""" | ||
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs) | ||
|
||
def _to_async(self): | ||
return self._async_evaluator |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from .adversarial_scenario import AdversarialScenario | ||
from .adversarial_simulator import AdversarialSimulator | ||
from .direct_attack_simulator import DirectAttackSimulator | ||
from .xpia_simulator import IndirectAttackSimulator | ||
|
||
__all__ = ["AdversarialSimulator", "AdversarialScenario", "DirectAttackSimulator"] | ||
__all__ = ["AdversarialSimulator", "AdversarialScenario", "DirectAttackSimulator", "IndirectAttackSimulator"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.