Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
42aa795
Initial approach for configurable system prompt
erickgalinkin Aug 18, 2025
d916cda
Add system prompt support in `Attempt`. Add helpful logging to `detec…
erickgalinkin Aug 18, 2025
3fa610a
Add `as_dict()` functionality to `Conversation` objects. Update `Open…
erickgalinkin Aug 18, 2025
0d7bc1c
Make code more DRY by using `as_dict` method throughout.
erickgalinkin Aug 19, 2025
436f781
Fix `as_dict` method
erickgalinkin Aug 19, 2025
38f5a61
Better attribute check
erickgalinkin Aug 19, 2025
9dfe466
Add `from_list()` method to `Conversation`. Refactor `from_dict` meth…
erickgalinkin Aug 19, 2025
3a79ff0
Fix `Turn.from_dict` classmethod. Fix issue with prompt being set inc…
erickgalinkin Aug 21, 2025
58d4b60
Fix detectors, add better logging for skips. Change `detectors.base.D…
erickgalinkin Aug 21, 2025
7d7d1dd
Improve detector logic and logging.
erickgalinkin Aug 21, 2025
0ac1d86
Improve detector logic and logging.
erickgalinkin Aug 21, 2025
198fd79
Remove _format_chat_prompt call from test_huggingface.py
erickgalinkin Aug 21, 2025
68b19f8
Fix issue in judge.py. Fix and add additional detector-related loggin…
erickgalinkin Aug 21, 2025
349f4cb
Fix judge.py and productkey.py bugs
erickgalinkin Aug 22, 2025
b0b512a
Fix lang_spec in Win5x5
erickgalinkin Aug 22, 2025
5acfd97
Revert functional change to `StartsWith` detector
erickgalinkin Aug 22, 2025
75d55bc
Revert changes to test_detectors.py
erickgalinkin Aug 22, 2025
5dc458e
Revert a bunch of changes. Move `system_prompt` to be a `run` paramet…
erickgalinkin Aug 25, 2025
2b107f4
Tests and docs for system prompt
erickgalinkin Aug 25, 2025
1225236
Refactor `conversation_to_list` to private method. Improve handling o…
erickgalinkin Aug 26, 2025
4987275
Change ValueError to `logging.warning` for atkgen.Tox.
erickgalinkin Aug 26, 2025
2793939
Fix hf test
erickgalinkin Aug 26, 2025
54494ae
Apply suggestions from code review
erickgalinkin Aug 28, 2025
10f1338
Fix atkgen expectations with sysprompt
erickgalinkin Aug 28, 2025
c021f75
Indentation error fix
erickgalinkin Aug 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 81 additions & 9 deletions garak/attempt.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,19 @@ class Turn:
role: str
content: Message

@classmethod
def from_dict(cls, value: dict):
    """Construct a Turn from a plain dict.

    :param value: dict with a required ``role`` key and an optional
        ``content`` key; ``content`` may be a plain string or a dict of
        ``Message`` constructor kwargs.
    :raises ValueError: if ``role`` is missing.
    """
    entity = deepcopy(value)
    # Guard clause: a Turn without a role is meaningless.
    if "role" not in entity:
        raise ValueError("Expected `role` in Turn dict")
    role = entity["role"]
    message = entity.pop("content", {})
    if isinstance(message, str):
        # Bare string content: wrap it in a Message
        content = Message(text=message)
    else:
        content = Message(**message)
    return cls(role=role, content=content)


@dataclass
Expand Down Expand Up @@ -143,6 +149,21 @@ def from_dict(value: dict):
ret_val.turns.append(Turn.from_dict(turn))
return ret_val

@classmethod
def from_list(cls, turn_list: list[dict]):
    """Build a Conversation from a list of turn dicts, one Turn per entry."""
    return cls(turns=list(map(Turn.from_dict, turn_list)))

def as_dict(self) -> list[dict]:
    """Convert Conversation object to a list of dicts.

    This is needed for a number of generators.
    """
    rendered = []
    for turn in self.turns:
        rendered.append({"role": turn.role, "content": turn.content.text})
    return rendered


class Attempt:
"""A class defining objects that represent everything that constitutes a single attempt at evaluating an LLM.
Expand All @@ -151,6 +172,8 @@ class Attempt:
:type status: int
:param prompt: The processed prompt that will presented to the generator
:type prompt: Union[str|Turn|Conversation]
:param system_prompt: System prompt derived from the generator
:type system_prompt: Union[Turn|str]
:param probe_classname: Name of the probe class that originated this ``Attempt``
:type probe_classname: str
:param probe_params: Non-default parameters logged by the probe
Expand All @@ -173,6 +196,8 @@ class Attempt:
:type lang: str, valid BCP47
:param reverse_translation_outputs: The reverse translation of output based on the original language of the probe
:type reverse_translation_outputs: List(str)
:param overwrite_system_prompt: Overwrite the system prompt if it is present.
:type overwrite_system_prompt: bool

Typical use:

Expand Down Expand Up @@ -202,6 +227,7 @@ def __init__(
self,
status=ATTEMPT_NEW,
prompt=None,
system_prompt=None,
probe_classname=None,
probe_params=None,
targets=None,
Expand All @@ -211,6 +237,7 @@ def __init__(
seq=-1,
lang=None, # language code for prompt as sent to the target
reverse_translation_outputs=None,
overwrite_system_prompt=False,
) -> None:
self.uuid = uuid.uuid4()
if prompt is not None:
Expand All @@ -225,6 +252,13 @@ def __init__(
if not hasattr(self, "conversations"):
self.conversations = [Conversation([Turn("user", msg)])]
self.prompt = self.conversations[0]

if system_prompt is not None:
self._add_system_prompt(
system_prompt=system_prompt,
overwrite=overwrite_system_prompt,
lang=lang,
)
else:
# is this the right way to model an empty Attempt?
self.conversations = [Conversation()]
Expand Down Expand Up @@ -285,6 +319,12 @@ def prompt(self) -> Union[Conversation, None]:
# exception, though that may be a reasonable trade off.
return None

@property
def initial_user_message(self) -> Union["Message", None]:
    """Content of the first user turn in the primary conversation.

    :return: the ``Message`` of the earliest turn with role ``"user"``,
        or ``None`` if the conversation contains no user turn.
    """
    for turn in self.conversations[0].turns:
        if turn.role == "user":
            return turn.content
    # No user turn present; be explicit rather than falling off the end.
    return None

@property
def lang(self):
return self.prompt.turns[-1].content.lang
Expand Down Expand Up @@ -361,9 +401,9 @@ def prompt_for(self, lang) -> Conversation:
"""
if (
lang is not None
and self.conversations[0].turns[0].content.lang != "*"
and self.initial_user_message.lang != "*"
and lang != "*"
and self.conversations[0].turns[0].content.lang != lang
and self.initial_user_message.lang != lang
):
return self.notes.get(
"pre_translation_prompt", self.prompt
Expand All @@ -378,9 +418,9 @@ def outputs_for(self, lang) -> List[Message]:
"""
if (
lang is not None
and self.conversations[0].turns[0].content.lang != "*"
and self.initial_user_message.lang != "*"
and lang != "*"
and self.conversations[0].turns[0].content.lang != lang
and self.initial_user_message.lang != lang
):
return (
self.reverse_translation_outputs
Expand Down Expand Up @@ -431,3 +471,35 @@ def _add_turn(self, role: str, contents: List[Union[Message, str]]) -> None:
"Conversation turn role must be one of '%s', got '%s'"
% ("'/'".join(roles), role)
)

def _add_system_prompt(
    self,
    system_prompt: Union[Turn, Message, str],
    overwrite: bool = False,
    lang: Union[None, str] = None,
) -> None:
    """Add system prompt to the start of the conversation.

    The system prompt is configured at the generator level.
    This inserts the system prompt of the generator at the
    start of the conversation if it is not present unless the
    `overwrite` flag is set to True, which may be necessary
    for some probes.

    :param system_prompt: the system prompt, as a ``str``, ``Message``, or
        complete ``Turn``.
    :param overwrite: replace an existing leading system turn if True.
    :param lang: language code applied when wrapping a plain string.
    :raises ValueError: if ``system_prompt`` is of an unsupported type.
    """
    if isinstance(system_prompt, str):
        content = Turn(
            role="system", content=Message(text=system_prompt, lang=lang)
        )
    elif isinstance(system_prompt, Message):
        # accept a bare Message, as promised by the error message below
        content = Turn(role="system", content=system_prompt)
    elif isinstance(system_prompt, Turn):
        content = system_prompt
    else:
        raise ValueError(
            "Could not set system prompt. Expected type `str`, `Turn`, or `Message` but got %s"
            % type(system_prompt)
        )
    # Guard against an empty conversation before peeking at turns[0]
    first_turns = self.conversations[0].turns
    if not first_turns or first_turns[0].role != "system":
        for conversation in self.conversations:
            conversation.turns.insert(0, content)
    elif overwrite:
        for conversation in self.conversations:
            conversation.turns[0] = content
69 changes: 60 additions & 9 deletions garak/detectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def detect(self, attempt: garak.attempt.Attempt) -> Iterable[float]:
"""Takes a list of Attempts; classifies them; returns a list of results
in the range 0.0-1.0 where 0.0 represents no hit, 1.0 represents successful hit
(i.e. vulnerability found)."""
return [0.5] * len(attempt.all_outputs)

raise NotImplementedError


class HFDetector(Detector, HFCompatible):
Expand Down Expand Up @@ -131,13 +132,27 @@ def __init__(self, config_root=_config):
transformers_logging.set_verbosity(orig_loglevel)

def detect(self, attempt: garak.attempt.Attempt) -> List[float]:
# goal: skip None outputs
# don't adjust attempt.outputs

all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__
)
return []

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)
# non_none_offsets = [k for k,v in enumerate(all_outputs) if v is not None] # we'll want this when outputs+scores need to align
try:
detector_raw_results = self.detector(
Expand Down Expand Up @@ -183,11 +198,28 @@ def detect(
detector_results = []
all_outputs = attempt.outputs_for(self.lang_spec)

for output in all_outputs:
match = False
output_text = output.text if output else None
if output_text is None:
continue
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__
)
return []

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)

for output_text in non_none_outputs:
match = False
for s in self.substrings:
if not case_sensitive:
Expand Down Expand Up @@ -219,6 +251,25 @@ def detect(
) -> Iterable[float]:
detector_results = []
all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__,
)

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)

if "triggers" in attempt.notes:
triggers = attempt.notes["triggers"]
Expand Down
31 changes: 25 additions & 6 deletions garak/detectors/continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
considering how much of it is in the prompt already
"""

import logging
import re
from typing import List

Expand All @@ -22,16 +23,34 @@ def detect(self, attempt: Attempt, case_sensitive=False) -> List[float]:
if triggers == []:
return results

for output in attempt.all_outputs:
if output is None or output.text is None:
continue
all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__,
)

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)

for output in non_none_outputs:
# continuation models might not include the whole slur in their output, so we should build some context in
# do this by concatenating the end of the prompt with the beginning of the output, and then scanning that context
# Do this by concatenating the end of the prompt with the beginning of the output, and then scanning that
# context

# reduce leading spaces in output
trimmed_output = re.sub(r"^\s+", " ", output.text)

trimmed_output = re.sub(r"^\s+", "", output)
hit_found = False
for trigger in triggers:
context = (
Expand Down
31 changes: 24 additions & 7 deletions garak/detectors/divergence.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import re
from typing import List

Expand Down Expand Up @@ -85,15 +85,32 @@ def detect(self, attempt: Attempt, case_sensitive=False) -> List[float]:
return detector_results

repeat_word = attempt.notes["repeat_word"]
for output in attempt.all_outputs:
output_text = output.text if output else None
if output_text is None:
continue
all_outputs = attempt.outputs_for(self.lang_spec)
if len(all_outputs) == 0:
logging.warning(
"No outputs received for %s. Generator returned a list of length 0."
% self.__class__.__name__,
)

non_none_outputs = [
v.text for k, v in enumerate(all_outputs) if v and v.text is not None
]
if len(non_none_outputs) == 0:
logging.warning(
"No valid outputs received for %s. All returned entries were None."
% self.__class__.__name__,
)
elif len(non_none_outputs) < len(all_outputs):
logging.info(
"%s skipping %d None type outputs"
% (self.__class__.__name__, (len(all_outputs) - len(non_none_outputs)))
)
for output in non_none_outputs:
if not case_sensitive:
repeat_word = repeat_word.lower()
output_text = output_text.lower()
output = output.lower()
detector_results.append(
1.0 if self._instance_is_a_hit(output_text, repeat_word) else 0.0
1.0 if self._instance_is_a_hit(output, repeat_word) else 0.0
)

return detector_results
Loading
Loading