Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
42aa795
Initial approach for configurable system prompt
erickgalinkin Aug 18, 2025
d916cda
Add system prompt support in `Attempt`. Add helpful logging to `detec…
erickgalinkin Aug 18, 2025
3fa610a
Add `as_dict()` functionality to `Conversation` objects. Update `Open…
erickgalinkin Aug 18, 2025
0d7bc1c
Make code more DRY by using `as_dict` method throughout.
erickgalinkin Aug 19, 2025
436f781
Fix `as_dict` method
erickgalinkin Aug 19, 2025
38f5a61
Better attribute check
erickgalinkin Aug 19, 2025
9dfe466
Add `from_list()` method to `Conversation`. Refactor `from_dict` meth…
erickgalinkin Aug 19, 2025
3a79ff0
Fix `Turn.from_dict` classmethod. Fix issue with prompt being set inc…
erickgalinkin Aug 21, 2025
58d4b60
Fix detectors, add better logging for skips. Change `detectors.base.D…
erickgalinkin Aug 21, 2025
7d7d1dd
Improve detector logic and logging.
erickgalinkin Aug 21, 2025
0ac1d86
Improve detector logic and logging.
erickgalinkin Aug 21, 2025
198fd79
Remove _format_chat_prompt call from test_huggingface.py
erickgalinkin Aug 21, 2025
68b19f8
Fix issue in judge.py. Fix and add additional detector-related loggin…
erickgalinkin Aug 21, 2025
349f4cb
Fix judge.py and productkey.py bugs
erickgalinkin Aug 22, 2025
b0b512a
Fix lang_spec in Win5x5
erickgalinkin Aug 22, 2025
5acfd97
Revert functional change to `StartsWith` detector
erickgalinkin Aug 22, 2025
75d55bc
Revert changes to test_detectors.py
erickgalinkin Aug 22, 2025
5dc458e
Revert a bunch of changes. Move `system_prompt` to be a `run` paramet…
erickgalinkin Aug 25, 2025
2b107f4
Tests and docs for system prompt
erickgalinkin Aug 25, 2025
1225236
Refactor `conversation_to_list` to private method. Improve handling o…
erickgalinkin Aug 26, 2025
4987275
Change ValueError to `logging.warning` for atkgen.Tox.
erickgalinkin Aug 26, 2025
2793939
Fix hf test
erickgalinkin Aug 26, 2025
54494ae
Apply suggestions from code review
erickgalinkin Aug 28, 2025
10f1338
Fix atkgen expectations with sysprompt
erickgalinkin Aug 28, 2025
c021f75
Indentation error fix
erickgalinkin Aug 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 81 additions & 9 deletions garak/attempt.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,19 @@ class Turn:
role: str
content: Message

@classmethod
def from_dict(cls, value: dict):
    """Construct a Turn from a plain dict.

    :param value: dict with a required ``role`` key and an optional
        ``content`` key; ``content`` may be a plain string or a dict of
        ``Message`` constructor kwargs.
    :raises ValueError: if ``role`` is missing.
    """
    entity = deepcopy(value)
    # Guard clause: a Turn without a role is meaningless.
    if "role" not in entity:
        raise ValueError("Expected `role` in Turn dict")
    role = entity["role"]
    message = entity.pop("content", {})
    if isinstance(message, str):
        # Bare string content: wrap it in a Message
        content = Message(text=message)
    else:
        content = Message(**message)
    return cls(role=role, content=content)


@dataclass
Expand Down Expand Up @@ -143,6 +149,21 @@ def from_dict(value: dict):
ret_val.turns.append(Turn.from_dict(turn))
return ret_val

@classmethod
def from_list(cls, turn_list: list[dict]):
    """Build a Conversation from a list of turn dicts, one Turn per entry."""
    return cls(turns=list(map(Turn.from_dict, turn_list)))

def as_dict(self) -> list[dict]:
    """Convert Conversation object to a list of dicts.

    This is needed for a number of generators.
    """
    rendered = []
    for turn in self.turns:
        rendered.append({"role": turn.role, "content": turn.content.text})
    return rendered


class Attempt:
"""A class defining objects that represent everything that constitutes a single attempt at evaluating an LLM.
Expand All @@ -151,6 +172,8 @@ class Attempt:
:type status: int
:param prompt: The processed prompt that will presented to the generator
:type prompt: Union[str|Turn|Conversation]
:param system_prompt: System prompt derived from the generator
:type system_prompt: Union[Turn|str]
:param probe_classname: Name of the probe class that originated this ``Attempt``
:type probe_classname: str
:param probe_params: Non-default parameters logged by the probe
Expand All @@ -173,6 +196,8 @@ class Attempt:
:type lang: str, valid BCP47
:param reverse_translation_outputs: The reverse translation of output based on the original language of the probe
:type reverse_translation_outputs: List(str)
:param overwrite_system_prompt: Overwrite the system prompt if it is present.
:type overwrite_system_prompt: bool

Typical use:

Expand Down Expand Up @@ -202,6 +227,7 @@ def __init__(
self,
status=ATTEMPT_NEW,
prompt=None,
system_prompt=None,
probe_classname=None,
probe_params=None,
targets=None,
Expand All @@ -211,6 +237,7 @@ def __init__(
seq=-1,
lang=None, # language code for prompt as sent to the target
reverse_translation_outputs=None,
overwrite_system_prompt=False,
) -> None:
self.uuid = uuid.uuid4()
if prompt is not None:
Expand All @@ -225,6 +252,13 @@ def __init__(
if not hasattr(self, "conversations"):
self.conversations = [Conversation([Turn("user", msg)])]
self.prompt = self.conversations[0]

if system_prompt is not None:
self._add_system_prompt(
system_prompt=system_prompt,
overwrite=overwrite_system_prompt,
lang=lang,
)
else:
# is this the right way to model an empty Attempt?
self.conversations = [Conversation()]
Expand Down Expand Up @@ -285,6 +319,12 @@ def prompt(self) -> Union[Conversation, None]:
# exception, though that may be a reasonable trade off.
return None

@property
def initial_user_message(self) -> Union["Message", None]:
    """Content of the first user turn in the primary conversation.

    :return: the ``Message`` of the earliest turn with role ``"user"``,
        or ``None`` if the conversation contains no user turn.
    """
    for turn in self.conversations[0].turns:
        if turn.role == "user":
            return turn.content
    # No user turn present; be explicit rather than falling off the end.
    return None

@property
def lang(self):
return self.prompt.turns[-1].content.lang
Expand Down Expand Up @@ -361,9 +401,9 @@ def prompt_for(self, lang) -> Conversation:
"""
if (
lang is not None
and self.conversations[0].turns[0].content.lang != "*"
and self.initial_user_message.lang != "*"
and lang != "*"
and self.conversations[0].turns[0].content.lang != lang
and self.initial_user_message.lang != lang
):
return self.notes.get(
"pre_translation_prompt", self.prompt
Expand All @@ -378,9 +418,9 @@ def outputs_for(self, lang) -> List[Message]:
"""
if (
lang is not None
and self.conversations[0].turns[0].content.lang != "*"
and self.initial_user_message.lang != "*"
and lang != "*"
and self.conversations[0].turns[0].content.lang != lang
and self.initial_user_message.lang != lang
):
return (
self.reverse_translation_outputs
Expand Down Expand Up @@ -431,3 +471,35 @@ def _add_turn(self, role: str, contents: List[Union[Message, str]]) -> None:
"Conversation turn role must be one of '%s', got '%s'"
% ("'/'".join(roles), role)
)

def _add_system_prompt(
    self,
    system_prompt: Union[Turn, Message, str],
    overwrite: bool = False,
    lang: Union[None, str] = None,
) -> None:
    """Add system prompt to the start of the conversation.

    The system prompt is configured at the generator level.
    This inserts the system prompt of the generator at the
    start of the conversation if it is not present unless the
    `overwrite` flag is set to True, which may be necessary
    for some probes.

    :param system_prompt: the system prompt, as a ``str``, ``Message``, or
        complete ``Turn``.
    :param overwrite: replace an existing leading system turn if True.
    :param lang: language code applied when wrapping a plain string.
    :raises ValueError: if ``system_prompt`` is of an unsupported type.
    """
    if isinstance(system_prompt, str):
        content = Turn(
            role="system", content=Message(text=system_prompt, lang=lang)
        )
    elif isinstance(system_prompt, Message):
        # accept a bare Message, as promised by the error message below
        content = Turn(role="system", content=system_prompt)
    elif isinstance(system_prompt, Turn):
        content = system_prompt
    else:
        raise ValueError(
            "Could not set system prompt. Expected type `str`, `Turn`, or `Message` but got %s"
            % type(system_prompt)
        )
    # Guard against an empty conversation before peeking at turns[0]
    first_turns = self.conversations[0].turns
    if not first_turns or first_turns[0].role != "system":
        for conversation in self.conversations:
            conversation.turns.insert(0, content)
    elif overwrite:
        for conversation in self.conversations:
            conversation.turns[0] = content
69 changes: 60 additions & 9 deletions garak/detectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def detect(self, attempt: garak.attempt.Attempt) -> Iterable[float]:
"""Takes a list of Attempts; classifies them; returns a list of results
in the range 0.0-1.0 where 0.0 represents no hit, 1.0 represents successful hit
(i.e. vulnerability found)."""
return [0.5] * len(attempt.all_outputs)

raise NotImplementedError


class HFDetector(Detector, HFCompatible):
Expand Down Expand Up @@ -131,13 +132,27 @@ def __init__(self, config_root=_config):
transformers_logging.set_verbosity(orig_loglevel)

def detect(self, attempt: garak.attempt.Attempt) -> List[float]:
# goal: skip None outputs
# don't adjust attempt.outputs

all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__
)
return []

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)
# non_none_offsets = [k for k,v in enumerate(all_outputs) if v is not None] # we'll want this when outputs+scores need to align
try:
detector_raw_results = self.detector(
Expand Down Expand Up @@ -183,11 +198,28 @@ def detect(
detector_results = []
all_outputs = attempt.outputs_for(self.lang_spec)

for output in all_outputs:
match = False
output_text = output.text if output else None
if output_text is None:
continue
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__
)
return []

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)

for output_text in non_none_outputs:
match = False
for s in self.substrings:
if not case_sensitive:
Expand Down Expand Up @@ -219,6 +251,25 @@ def detect(
) -> Iterable[float]:
detector_results = []
all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__,
)

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)

if "triggers" in attempt.notes:
triggers = attempt.notes["triggers"]
Expand Down
31 changes: 25 additions & 6 deletions garak/detectors/continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
considering how much of it is in the prompt already
"""

import logging
import re
from typing import List

Expand All @@ -22,16 +23,34 @@ def detect(self, attempt: Attempt, case_sensitive=False) -> List[float]:
if triggers == []:
return results

for output in attempt.all_outputs:
if output is None or output.text is None:
continue
all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__,
)

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)

for output in non_none_outputs:
# continuation models might not include the whole slur in their output, so we should build some context in
# do this by concatenating the end of the prompt with the beginning of the output, and then scanning that context
# Do this by concatenating the end of the prompt with the beginning of the output, and then scanning that
# context

# reduce leading spaces in output
trimmed_output = re.sub(r"^\s+", " ", output.text)

trimmed_output = re.sub(r"^\s+", "", output)
hit_found = False
for trigger in triggers:
context = (
Expand Down
31 changes: 24 additions & 7 deletions garak/detectors/divergence.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import re
from typing import List

Expand Down Expand Up @@ -85,15 +85,32 @@ def detect(self, attempt: Attempt, case_sensitive=False) -> List[float]:
return detector_results

repeat_word = attempt.notes["repeat_word"]
for output in attempt.all_outputs:
output_text = output.text if output else None
if output_text is None:
continue
all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__,
)

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)
for output in non_none_outputs:
if not case_sensitive:
repeat_word = repeat_word.lower()
output_text = output_text.lower()
output = output.lower()
detector_results.append(
1.0 if self._instance_is_a_hit(output_text, repeat_word) else 0.0
1.0 if self._instance_is_a_hit(output, repeat_word) else 0.0
)

return detector_results
Loading
Loading