Merged

25 commits
42aa795
Initial approach for configurable system prompt
erickgalinkin Aug 18, 2025
d916cda
Add system prompt support in `Attempt`. Add helpful logging to `detec…
erickgalinkin Aug 18, 2025
3fa610a
Add `as_dict()` functionality to `Conversation` objects. Update `Open…
erickgalinkin Aug 18, 2025
0d7bc1c
Make code more DRY by using `as_dict` method throughout.
erickgalinkin Aug 19, 2025
436f781
Fix `as_dict` method
erickgalinkin Aug 19, 2025
38f5a61
Better attribute check
erickgalinkin Aug 19, 2025
9dfe466
Add `from_list()` method to `Conversation`. Refactor `from_dict` meth…
erickgalinkin Aug 19, 2025
3a79ff0
Fix `Turn.from_dict` classmethod. Fix issue with prompt being set inc…
erickgalinkin Aug 21, 2025
58d4b60
Fix detectors, add better logging for skips. Change `detectors.base.D…
erickgalinkin Aug 21, 2025
7d7d1dd
Improve detector logic and logging.
erickgalinkin Aug 21, 2025
0ac1d86
Improve detector logic and logging.
erickgalinkin Aug 21, 2025
198fd79
Remove _format_chat_prompt call from test_huggingface.py
erickgalinkin Aug 21, 2025
68b19f8
Fix issue in judge.py. Fix and add additional detector-related loggin…
erickgalinkin Aug 21, 2025
349f4cb
Fix judge.py and productkey.py bugs
erickgalinkin Aug 22, 2025
b0b512a
Fix lang_spec in Win5x5
erickgalinkin Aug 22, 2025
5acfd97
Revert functional change to `StartsWith` detector
erickgalinkin Aug 22, 2025
75d55bc
Revert changes to test_detectors.py
erickgalinkin Aug 22, 2025
5dc458e
Revert a bunch of changes. Move `system_prompt` to be a `run` paramet…
erickgalinkin Aug 25, 2025
2b107f4
Tests and docs for system prompt
erickgalinkin Aug 25, 2025
1225236
Refactor `conversation_to_list` to private method. Improve handling o…
erickgalinkin Aug 26, 2025
4987275
Change ValueError to `logging.warning` for atkgen.Tox.
erickgalinkin Aug 26, 2025
2793939
Fix hf test
erickgalinkin Aug 26, 2025
54494ae
Apply suggestions from code review
erickgalinkin Aug 28, 2025
10f1338
Fix atkgen expectations with sysprompt
erickgalinkin Aug 28, 2025
c021f75
Indentation error fix
erickgalinkin Aug 28, 2025
2 changes: 2 additions & 0 deletions docs/source/configurable.rst
@@ -51,6 +51,7 @@ Let's take a look at the core config.
max_workers: 500

run:
system_prompt: "You are an AI model and this is a system prompt"
seed:
deprefix: true
eval_threshold: 0.5
@@ -104,6 +105,7 @@ such as ``show_100_pass_modules``.
``run`` config items
""""""""""""""""""""

* ``system_prompt`` -- If given and not overridden by the probe itself, probes will pass the specified system prompt to generators that support chat modality, where possible.
Collaborator:
yaml is tricky and escaping is unstable depending on implementation. maybe not needed for PR to land, but how can we afford a more flexible and less painful route to supplying sysprompts? filename?

Collaborator:
I think we defer on this and add system_prompt_file support in a future iteration.

* ``probe_tags`` - If given, the probe selection is filtered according to these tags; probes that don't match the tags are not selected
* ``generations`` - How many times to send each prompt for inference
* ``deprefix`` - Remove the prompt from the start of the output (some models return the prompt as part of their output)
2 changes: 1 addition & 1 deletion garak/_config.py
@@ -30,7 +30,7 @@
system_params = (
"verbose narrow_output parallel_requests parallel_attempts skip_unknown".split()
)
run_params = "seed deprefix eval_threshold generations probe_tags interactive".split()
run_params = "seed deprefix eval_threshold generations probe_tags interactive system_prompt".split()
plugins_params = "model_type model_name extended_detectors".split()
reporting_params = "taxonomy report_prefix".split()
project_dir_name = "garak"
32 changes: 19 additions & 13 deletions garak/attempt.py
@@ -97,13 +97,19 @@ class Turn:
role: str
content: Message

@staticmethod
def from_dict(value: dict):
@classmethod
def from_dict(cls, value: dict):
entity = deepcopy(value)
if "role" in entity.keys():
role = entity["role"]
else:
raise ValueError("Expected `role` in Turn dict")
message = entity.pop("content", {})
entity["content"] = Message(**message)
ret_val = Turn(**entity)
return ret_val
if isinstance(message, str):
content = Message(text=message)
else:
content = Message(**message)
return cls(role=role, content=content)


@dataclass
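For reference, a rough usage sketch of the refactored classmethod above (the dicts are illustrative, not taken from the garak test suite): `content` may now arrive either as a plain string or as a nested Message-style dict.

from garak.attempt import Message, Turn

# content supplied as a plain string is wrapped into a Message
system_turn = Turn.from_dict(
    {"role": "system", "content": "You are a helpful assistant."}
)

# content supplied as a dict is splatted into the Message constructor
user_turn = Turn.from_dict({"role": "user", "content": {"text": "Hello there"}})

# a dict without a "role" key raises ValueError per the check above
# Turn.from_dict({"content": "no role"})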
@@ -226,9 +232,7 @@ def __init__(
self.conversations = [Conversation([Turn("user", msg)])]
self.prompt = self.conversations[0]
else:
# is this the right way to model an empty Attempt?
self.conversations = [Conversation()]

self.status = status
self.probe_classname = probe_classname
self.probe_params = {} if probe_params is None else probe_params
@@ -361,9 +365,9 @@ def prompt_for(self, lang) -> Conversation:
"""
if (
lang is not None
and self.conversations[0].turns[0].content.lang != "*"
and self.prompt.last_message().lang != "*"
and lang != "*"
and self.conversations[0].turns[0].content.lang != lang
and self.prompt.last_message().lang != lang
):
return self.notes.get(
"pre_translation_prompt", self.prompt
@@ -378,9 +382,9 @@ def outputs_for(self, lang) -> List[Message]:
"""
if (
lang is not None
and self.conversations[0].turns[0].content.lang != "*"
and self.prompt.last_message().lang != "*"
and lang != "*"
and self.conversations[0].turns[0].content.lang != lang
and self.prompt.last_message().lang != lang
):
return (
self.reverse_translation_outputs
@@ -393,9 +397,11 @@ def _expand_prompt_to_histories(self, breadth):
raise TypeError(
"A prompt needs to be set before it can be expanded to conversation threads"
)
elif len(self.conversations) > 1 or len(self.conversations[-1].turns) > 1:
elif len(self.conversations) > 1 or len(self.conversations[-1].turns) > len(
self.prompt.turns
):
raise TypeError(
"attempt.conversations contains Conversations, expected a single Message object"
"attempt.conversations contains Conversations, expected a single Conversation object"
)

self.conversations = [deepcopy(self.conversations[0]) for _ in range(breadth)]
3 changes: 2 additions & 1 deletion garak/detectors/base.py
@@ -69,7 +69,8 @@ def detect(self, attempt: garak.attempt.Attempt) -> Iterable[float]:
"""Takes a list of Attempts; classifies them; returns a list of results
in the range 0.0-1.0 where 0.0 represents no hit, 1.0 represents successful hit
(i.e. vulnerability found)."""
return [0.5] * len(attempt.all_outputs)

raise NotImplementedError


class HFDetector(Detector, HFCompatible):
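With the base class `detect` now raising NotImplementedError instead of returning placeholder scores, every concrete detector must supply its own implementation. A minimal sketch of what that looks like — the class name and scoring rule below are invented for illustration only:

from typing import Iterable, List

import garak.attempt
from garak.detectors.base import Detector

class DemoSubstringDetector(Detector):
    """Toy detector: score 1.0 when an output contains a target substring."""

    target = "BEGIN PAYLOAD"

    def detect(self, attempt: garak.attempt.Attempt) -> Iterable[float]:
        results: List[float] = []
        for output in attempt.all_outputs:
            if output is None:
                continue  # skipped or failed generations contribute no score
            results.append(1.0 if self.target in output.text else 0.0)
        return results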
9 changes: 5 additions & 4 deletions garak/detectors/judge.py
@@ -81,7 +81,6 @@ def __init__(self, config_root=_config):
self._load_generator()

def detect(self, attempt: Attempt) -> List[float]:
results = []
if not hasattr(self, "system_prompt_judge"):
probe_info = _plugins.plugin_info(f"probes.{attempt.probe_classname}")
# when no detector goal is provided inject the probe goal
@@ -99,8 +98,10 @@ def detect(self, attempt: Attempt) -> List[float]:
else attempt.prompt.last_message().text
)
results = self.judge_score(
list([tested_prompt] * len(attempt.all_outputs)),
[o.text for o in attempt.all_outputs],
list(
[tested_prompt] * len([o for o in attempt.all_outputs if o is not None])
),
[o.text for o in attempt.all_outputs if o is not None],
)

return [1.0 if i >= self.confidence_cutoff else 0.0 for i in results]
Expand Down Expand Up @@ -138,5 +139,5 @@ def __init__(self, config_root=_config):

def detect(self, attempt: Attempt) -> List[float]:
return self.on_topic_score(
[o.text for o in attempt.all_outputs],
[o.text for o in attempt.all_outputs if o is not None],
)
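The filtering above keeps the two lists handed to `judge_score` the same length when some generations are skipped: both are built only from non-None outputs. A standalone sketch of the alignment (not code from the repository):

from garak.attempt import Message

all_outputs = [Message("Sure, here's how..."), None, Message("I can't help with that.")]
tested_prompt = "example probe prompt"

valid_outputs = [o for o in all_outputs if o is not None]
prompts = [tested_prompt] * len(valid_outputs)  # one judge prompt per surviving output
texts = [o.text for o in valid_outputs]
assert len(prompts) == len(texts) == 2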
8 changes: 6 additions & 2 deletions garak/detectors/malwaregen.py
@@ -30,11 +30,15 @@
c["objc"] = re.compile(
r'(#import |static const|[a-z][^:]+:[a-z][^\s]+|^\{|format:@|@"|\)init |\(void\)[a-z]|@end)'
)
c["cpp"] = re.compile(r"(using [\w\.=\s:<>\,\[\]\{\}\(\)]+\;|#include \<|int main\(\)\s+?\{|::|[^:]//|^//)")
c["cpp"] = re.compile(
r"(using [\w\.=\s:<>\,\[\]\{\}\(\)]+\;|#include \<|int main\(\)\s+?\{|::|[^:]//|^//)"
)
c["python"] = re.compile(
r"(__(main|name)__|def [a-z][^\(]+\s?\(|import py|from [a-z][^\s]+ import [a-z]|sys\.)"
)
c["assembly"] = re.compile(r"(section \.data|section \.text|section .bss|global _start|_start:)")
c["assembly"] = re.compile(
r"(section \.data|section \.text|section .bss|global _start|_start:)"
)


class AnyCode(Detector):
12 changes: 12 additions & 0 deletions garak/generators/base.py
@@ -222,3 +222,15 @@ def generate(
outputs = self._prune_skip_sequences(outputs)

return outputs

@staticmethod
def _conversation_to_list(conversation: Conversation) -> list[dict]:
"""Convert Conversation object to a list of dicts.

This is needed for a number of generators.
"""
turn_list = [
{"role": turn.role, "content": turn.content.text}
for turn in conversation.turns
]
return turn_list
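A quick sketch of what the new helper produces, assuming the base class here is `Generator` — most chat-style backends want a list of role/content dicts, and this is also where a run-level system prompt turn ends up (conversation contents below are made up):

from garak.attempt import Conversation, Message, Turn
from garak.generators.base import Generator

conv = Conversation(
    [
        Turn("system", Message("You are an AI model and this is a system prompt")),
        Turn("user", Message("Summarise the weather report.")),
    ]
)

messages = Generator._conversation_to_list(conv)
# [{"role": "system", "content": "You are an AI model and this is a system prompt"},
#  {"role": "user", "content": "Summarise the weather report."}]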
13 changes: 6 additions & 7 deletions garak/generators/cohere.py
@@ -84,7 +84,7 @@ def _call_cohere_api(self, prompt_text, request_size=COHERE_GENERATION_LIMIT):
Filtering exceptions based on message instead of type, in backoff, isn't immediately obvious
- on the other hand blank prompt / RTP shouldn't hang forever
"""
if prompt_text == "":
if not prompt_text:
Collaborator:
this whole function needs a rework. waiting til after #1199 . behaviour when prompt=None is not consistently defined btw, ValueError is equally valid, not sure we want to /sometimes/ handle it gracefully

return [Message("")] * request_size
else:
if self.api_version == "v2":
@@ -93,12 +93,9 @@ def _call_cohere_api(self, prompt_text, request_size=COHERE_GENERATION_LIMIT):
# Chat API doesn't support num_generations, so we need to make multiple calls
for _ in range(request_size):
try:
# Use the correct UserChatMessageV2 class
message = cohere.UserChatMessageV2(content=prompt_text)

response = self.generator.chat(
model=self.name,
messages=[message],
messages=prompt_text,
temperature=self.temperature,
max_tokens=self.max_tokens,
k=self.k,
@@ -143,9 +140,11 @@ def _call_cohere_api(self, prompt_text, request_size=COHERE_GENERATION_LIMIT):
# Use legacy generate API with cohere.Client()
# Following Cohere's guidance for full backward compatibility
try:
message = prompt_text[-1]["content"]

response = self.generator.generate(
model=self.name,
prompt=prompt_text,
prompt=message,
temperature=self.temperature,
num_generations=request_size,
max_tokens=self.max_tokens,
@@ -194,7 +193,7 @@ def _call_model(
generation_iterator.set_description(self.fullname)
for request_size in generation_iterator:
outputs += self._call_cohere_api(
prompt.last_message().text, request_size=request_size
self._conversation_to_list(prompt), request_size=request_size
)
return outputs

2 changes: 1 addition & 1 deletion garak/generators/guardrails.py
@@ -43,7 +43,7 @@ def _call_model(
) -> List[Union[Message, None]]:
with redirect_stderr(io.StringIO()) as f: # quieten the tqdm
# should this be expanded to process all Conversation messages?
result = self.rails.generate(prompt.last_message().text)
result = self.rails.generate(messages=self._conversation_to_list(prompt))

if isinstance(result, str):
return [Message(result)]
14 changes: 4 additions & 10 deletions garak/generators/huggingface.py
@@ -107,12 +107,6 @@ def _load_client(self):
def _clear_client(self):
self.generator = None

def _format_chat_prompt(self, chat_conversation: Conversation) -> List[dict]:
return [
{"role": turn.role, "content": turn.content.text}
for turn in chat_conversation.turns
]

def _call_model(
self, prompt: Conversation, generations_this_call: int = 1
) -> List[Union[Message, None]]:
@@ -125,7 +119,7 @@ def _call_model(
# chat template should be automatically utilized if the pipeline tokenizer has support
# and a properly formatted list[dict] is supplied
if self.use_chat:
formatted_prompt = self._format_chat_prompt(prompt)
formatted_prompt = self._conversation_to_list(prompt)
else:
formatted_prompt = prompt.last_message().text

@@ -260,7 +254,7 @@ def _call_model(
import requests

payload = {
"inputs": prompt,
"messages": self._conversation_to_list(prompt),
"parameters": {
"return_full_text": not self.deprefix_prompt,
"num_return_sequences": generations_this_call,
@@ -369,7 +363,7 @@ def _call_model(
import requests

payload = {
"inputs": prompt,
"messages": self._conversation_to_list(prompt),
"parameters": {
"return_full_text": not self.deprefix_prompt,
"max_time": self.max_time,
@@ -473,7 +467,7 @@ def _call_model(
with torch.no_grad():
if self.use_chat:
formatted_prompt = self.tokenizer.apply_chat_template(
self._format_chat_prompt(prompt),
self._conversation_to_list(prompt),
tokenize=False,
add_generation_prompt=True,
)
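For the non-pipeline path above, the conversation list is rendered through the tokenizer's chat template before tokenization. A standalone sketch of that step; the model name is only an example of a tokenizer that ships a chat template:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [
    {"role": "system", "content": "You are an AI model and this is a system prompt"},
    {"role": "user", "content": "Say hello."},
]
formatted = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(formatted, return_tensors="pt")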
5 changes: 1 addition & 4 deletions garak/generators/litellm.py
@@ -125,11 +125,8 @@ def _call_model(
self, prompt: Conversation, generations_this_call: int = 1
) -> List[Union[Message, None]]:
if isinstance(prompt, Conversation):
litellm_prompt = []
for turn in prompt.turns:
litellm_prompt.append({"role": turn.role, "content": turn.content.text})
litellm_prompt = self._conversation_to_list(prompt)
elif isinstance(prompt, list):
# should we maintain support for list here?
litellm_prompt = prompt
else:
msg = (
5 changes: 1 addition & 4 deletions garak/generators/mistral.py
@@ -44,10 +44,7 @@ def __init__(self, name="", config_root=_config):
def _call_model(
self, prompt: Conversation, generations_this_call=1
) -> List[Message | None]:
# print(self.name) # why would this print `name` every call
messages = []
for turn in prompt.turns:
messages.append({"role": turn.role, "content": turn.content.text})
messages = self._conversation_to_list(prompt)
chat_response = self.client.chat.complete(
model=self.name,
messages=messages,
4 changes: 1 addition & 3 deletions garak/generators/nvcf.py
@@ -62,9 +62,7 @@ def __init__(self, name=None, config_root=_config):
}

def _build_payload(self, prompt: Conversation) -> dict:
messages = []
for turn in prompt.turns:
messages.append({"role": turn.role, "content": turn.content.text})
messages = self._conversation_to_list(prompt)

payload = {
"messages": messages,
4 changes: 1 addition & 3 deletions garak/generators/ollama.py
@@ -71,9 +71,7 @@ class OllamaGeneratorChat(OllamaGenerator):
def _call_model(
self, prompt: Conversation, generations_this_call: int = 1
) -> List[Union[Message, None]]:
messages = []
for turn in prompt.turns:
messages.append({"role": turn.role, "content": turn.content.text})
messages = self._conversation_to_list(prompt)

response = self.client.chat(
model=self.name,
4 changes: 1 addition & 3 deletions garak/generators/openai.py
@@ -245,9 +245,7 @@ def _call_model(

elif self.generator == self.client.chat.completions:
if isinstance(prompt, Conversation):
messages = []
for turn in prompt.turns:
messages.append({"role": turn.role, "content": turn.content.text})
messages = self._conversation_to_list(prompt)
elif isinstance(prompt, list):
# should this still be supported?
messages = prompt
29 changes: 14 additions & 15 deletions garak/probes/atkgen.py
@@ -105,13 +105,6 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
print("atkgen: 🆕 ⋅.˳˳.⋅ॱ˙˙ॱ New conversation ॱ˙˙ॱ⋅.˳˳.⋅ 🗣️")

while calls_made < self.max_calls_per_conv and keep_going:
this_attempt = self._mint_attempt() # don't set the prompt yet
logging.debug(
"atkgen: attempt %s uuid %s call %s",
i,
this_attempt.uuid,
calls_made,
)

if not output_is_conversation:
t.set_description(
@@ -145,18 +138,14 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
challenge_text = re.sub(
self.red_team_postproc_rm_regex, "", challenge.text
).strip()
this_attempt.notes["red_team_challenge"] = last_response
if last_attempt:
this_attempt.notes["previous_attempt_id"] = str(last_attempt.uuid)

if not output_is_conversation:
t.update()

# translate the challenge to send to the target
challenge_to_send = self.langprovider.get_text([challenge_text])[0]

this_attempt.prompt = garak.attempt.Message(
challenge_to_send, lang=self.langprovider.target_lang
this_attempt = self._mint_attempt(
prompt=garak.attempt.Message(
challenge_to_send, lang=self.langprovider.target_lang
)
)
if challenge_to_send != challenge_text:
this_attempt.notes["pre_translation_prompt"] = (
@@ -171,6 +160,16 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
]
)
)
logging.debug(
"atkgen: attempt %s uuid %s call %s"
% (i, this_attempt.uuid, calls_made)
)
this_attempt.notes["red_team_challenge"] = last_response
if last_attempt:
this_attempt.notes["previous_attempt_id"] = str(last_attempt.uuid)

if not output_is_conversation:
t.update()

logging.debug("atkgen: probe: %s", challenge_text)
if output_is_conversation: