judge detectors conversation refactor (NVIDIA#1346)

jmartin-tech · jmartin-tech · commit 94a33b4a6e0e · 2025-08-28T11:39:24.000-05:00
diff --git a/garak/attempt.py b/garak/attempt.py
@@ -129,7 +129,7 @@ def last_message(self, role=None) -> Message:
             raise ValueError("No messages available")
         if not role:
             return self.turns[-1].content
-        for idx in range(len(self.turns), 0):
+        for idx in range(len(self.turns) - 1, -1, -1):
             if role == self.turns[idx].role:
                 return self.turns[idx].content
         raise ValueError(f"No messages for role: {role}")
diff --git a/garak/resources/red_team/evaluation.py b/garak/resources/red_team/evaluation.py
@@ -122,10 +122,11 @@ def judge_score(self, attack_prompt_list, target_response_list):
             Conversation(
                 [
                     Turn(
-                        "user",
-                        Message(
-                            self._create_conv(get_evaluator_prompt(prompt, response))
-                        ),
+                        role=turn["role"],
+                        content=Message(turn["content"]),
+                    )
+                    for turn in self._create_conv(
+                        get_evaluator_prompt(prompt, response)
                     )
                 ]
             )
@@ -142,13 +143,12 @@ def on_topic_score(self, attempt_list):
             Conversation(
                 [
                     Turn(
-                        "user",
-                        Message(
-                            self._create_conv(
-                                get_evaluator_prompt_on_topic(prompt),
-                                system_prompt=self.system_prompt_on_topic,
-                            )
-                        ),
+                        role=turn["role"],
+                        content=Message(turn["content"]),
+                    )
+                    for turn in self._create_conv(
+                        get_evaluator_prompt_on_topic(prompt),
+                        system_prompt=self.system_prompt_on_topic,
                     )
                 ]
             )
diff --git a/tests/detectors/test_detectors_judge.py b/tests/detectors/test_detectors_judge.py
@@ -69,5 +69,11 @@ def test_klass_detect(judge_config, mocker, klass, response, res_val):
     a.outputs = [Message("fake")]
     res = d.detect(a)
     mock_generate.assert_called_once()
+    assert (
+        len(mock_generate.call_args_list[0].args[0].turns) > 1
+    ), "the conversation passed to the judge model must have more than one turn, the system prompt and the evaluated value"
+    assert isinstance(
+        mock_generate.call_args_list[0].args[0].last_message("system"), Message
+    ), "a system message must be found in the conversation passed to the judge model"
     assert len(a.all_outputs) == len(res)
     assert [res_val * len(a.all_outputs)] == res
diff --git a/tests/test_attempt.py b/tests/test_attempt.py
@@ -105,6 +105,30 @@ def test_conversation_internal_serialize():
     assert src_conv == dest
 
 
+def test_last_message():  # exercises Conversation.last_message() with and without a role filter
+    test_system_msg = garak.attempt.Message("the system is under control")
+    test_user_msg = garak.attempt.Message(
+        "But the point is, if you lie all the time, nobody's going to believe you, even when you're telling the truth."
+    )
+    test_assistant_msg = garak.attempt.Message("AI does not understand")
+    test_user_msg_2 = garak.attempt.Message("That figures")  # appended later as a second user turn
+
+    turns = [  # one turn per role, oldest first
+        garak.attempt.Turn("system", test_system_msg),
+        garak.attempt.Turn("user", test_user_msg),
+        garak.attempt.Turn("assistant", test_assistant_msg),
+    ]
+    conv = garak.attempt.Conversation(turns)
+    assert conv.last_message() == test_assistant_msg  # no role: content of the final turn
+    assert conv.last_message("system") == test_system_msg  # role lookup reaches back to the first turn
+    assert conv.last_message("user") == test_user_msg  # role lookup for a turn in the middle
+
+    new_turn = garak.attempt.Turn("user", test_user_msg_2)
+    conv.turns.append(new_turn)  # "user" now has two candidate turns
+    assert conv.last_message("user") == test_user_msg_2  # the most recent user turn must win
+    assert conv.last_message() == test_user_msg_2  # and it is also the final turn overall
+
+
 ##########################
 # Test Attempt LifeCycle #
 ##########################

Original file line number	Diff line number	Diff line change
`@@ -122,10 +122,11 @@ def judge_score(self, attack_prompt_list, target_response_list):`
`122`	`122`	`Conversation(`
`123`	`123`	`[`
`124`	`124`	`Turn(`
`125`		`- "user",`
`126`		`- Message(`
`127`		`- self._create_conv(get_evaluator_prompt(prompt, response))`
`128`		`- ),`
	`125`	`+ role=turn["role"],`
	`126`	`+ content=Message(turn["content"]),`
	`127`	`+ )`
	`128`	`+ for turn in self._create_conv(`
	`129`	`+ get_evaluator_prompt(prompt, response)`
`129`	`130`	`)`
`130`	`131`	`]`
`131`	`132`	`)`
`@@ -142,13 +143,12 @@ def on_topic_score(self, attempt_list):`
`142`	`143`	`Conversation(`
`143`	`144`	`[`
`144`	`145`	`Turn(`
`145`		`- "user",`
`146`		`- Message(`
`147`		`- self._create_conv(`
`148`		`- get_evaluator_prompt_on_topic(prompt),`
`149`		`- system_prompt=self.system_prompt_on_topic,`
`150`		`- )`
`151`		`- ),`
	`146`	`+ role=turn["role"],`
	`147`	`+ content=Message(turn["content"]),`
	`148`	`+ )`
	`149`	`+ for turn in self._create_conv(`
	`150`	`+ get_evaluator_prompt_on_topic(prompt),`
	`151`	`+ system_prompt=self.system_prompt_on_topic,`
`152`	`152`	`)`
`153`	`153`	`]`
`154`	`154`	`)`