From a3b3ad6bdd3d36617862359d59721516e518d926 Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Wed, 1 Jul 2026 19:08:43 +0200
Subject: [PATCH] fix(ai): honor exploit mode in detection + document it (D2)

`_detect_mode` accepted only ("attack","chat") from the intent LLM and
discarded an "exploit" classification (fell back to old_mode/chat), even
though the full exploit mode exists (MODES entry, SYSTEM_EXPLOIT prompt,
get_system_prompt branch, _selection.txt classifies it). So exploit mode
was unreachable by detection and the `mode` opt help omitted it.

- _detect_mode: accept any mode in MODES (incl. exploit); attack/chat
  unchanged.
- `mode` opt help derived from MODES.keys() (DRY, no drift).
- Tests: LLM "exploit" verdict now sets mode=exploit; attack/chat/unknown
  unchanged; exploit system prompt renders with no leftover template vars.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 secator/tasks/ai.py             |  6 ++--
 tests/unit/test_ai_prompts.py   | 20 ++++++++++++
 tests/unit/test_ai_task_opts.py | 54 +++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/secator/tasks/ai.py b/secator/tasks/ai.py
index b750ad114..bf5e86226 100644
--- a/secator/tasks/ai.py
+++ b/secator/tasks/ai.py
@@ -22,7 +22,7 @@
 from secator.ai.encryption import SensitiveDataEncryptor, maybe_encrypt
 from secator.ai.history import ChatHistory, truncate_to_tokens, get_context_window
 from secator.ai.prompts import (
-	load_prompt, get_system_prompt, get_mode_config, format_tool_result, format_continue
+	load_prompt, get_system_prompt, get_mode_config, format_tool_result, format_continue, MODES
 )
 from secator.ai.tools import build_tool_schemas, tool_call_to_action, TOOL_SCHEMAS
 from secator.ai.session import save_history, show_session_picker, replay_session, restore_history_from_db
@@ -71,7 +71,7 @@ class ai(PythonRunner):
 	opts = {
 		"name": {"type": str, "default": "", "short": "n", "internal_name": "session_name", "help": "Name for the AI session or subagent"},  # noqa: E501
 		"prompt": {"type": str, "default": "", "short": "p", "help": "Prompt"},
-		"mode": {"type": str, "default": "", "help": "Mode: attack or chat"},
+		"mode": {"type": str, "default": "", "help": f"Mode: {', '.join(MODES)}"},  # D2: derive from MODES, don't drift
 		"model": {"type": str, "default": CONFIG.addons.ai.default_model, "help": "LLM model"},
 		# Never set a secret/CONFIG value as a task-option `default`: secator-api
 		# serves task opts (including defaults) to the UI, so a CONFIG default
@@ -761,7 +761,7 @@ def _detect_mode(self, force=False):
 					result = call_llm(messages, self.intent_model, temperature=0.3, api_base=self.api_base, api_key=self.api_key)  # noqa: E501
 				self._account_usage(result.get("usage"))
 				mode = result["content"].strip().lower()
-				if mode in ("attack", "chat"):
+				if mode in MODES:  # D2: honor any real mode (incl. exploit), don't discard it
 					console.print(rf"[bold green]\[INF][/] Detected intent: [bold]{mode}[/]")
 					self.mode = mode
 				else:
diff --git a/tests/unit/test_ai_prompts.py b/tests/unit/test_ai_prompts.py
index 5b63e349b..e05513c1f 100644
--- a/tests/unit/test_ai_prompts.py
+++ b/tests/unit/test_ai_prompts.py
@@ -101,6 +101,26 @@ def test_get_system_prompt_exploit(self):
 		self.assertIn("exploitation verification specialist", prompt)
 		self.assertIn("proof-of-concept", prompt)
 
+	def test_get_system_prompt_exploit_no_leftover_placeholders(self):
+		"""D2: exploit renders fully — no unresolved ${include} or template $vars.
+
+		Literal Mongo operators ($in/$regex) and example secrets ($API_KEY) are
+		content, not Template vars, so the known template names are checked explicitly.
+		"""
+		import re
+		prompt = get_system_prompt("exploit")
+		# All ${include} directives resolved (load_prompt) and $var substitutions done.
+		self.assertEqual(re.findall(r'\$\{\w+\}', prompt), [], "unresolved ${include} in exploit prompt")
+		template_vars = [
+			"library_reference", "discovery", "common", "queries", "findings",
+			"arsenal", "guardrails", "isolation", "exploitation_report",
+			"workspace_path", "query_types", "output_types_reference",
+		]
+		# Match whole identifiers only: "$common" must not be reported because of a longer "$common_x".
+		leftover = [v for v in template_vars if re.search(rf"\${v}(?![_a-zA-Z0-9])", prompt)]
+		self.assertEqual(leftover, [], f"unresolved template vars in exploit prompt: {leftover}")
+		self.assertIn("exploitation verification specialist", prompt)  # exploit template, not attack/chat
+
 	def test_get_system_prompt_attack_has_library_reference(self):
 		prompt = get_system_prompt("attack")
 		self.assertIn('<tasks>', prompt)
diff --git a/tests/unit/test_ai_task_opts.py b/tests/unit/test_ai_task_opts.py
index 518dec89b..401c0a0ec 100644
--- a/tests/unit/test_ai_task_opts.py
+++ b/tests/unit/test_ai_task_opts.py
@@ -1,5 +1,6 @@
 """Tests for AI task subagent opts."""
 import unittest
+from unittest.mock import MagicMock, patch
 
 from secator.definitions import ADDONS_ENABLED
 
@@ -25,6 +26,59 @@ def test_max_workers_opt_exists(self):
         self.assertEqual(ai.opts["max_workers"].get("default"), 3)
         self.assertTrue(ai.opts["max_workers"].get("internal", False))
 
+    def test_mode_opt_help_lists_all_modes(self):
+        """D2: every mode in MODES must appear in the `mode` opt help string."""
+        from secator.tasks.ai import ai
+        from secator.ai.prompts import MODES
+        mode_help = ai.opts["mode"]["help"]
+        undocumented = [name for name in MODES if name not in mode_help]
+        self.assertEqual(undocumented, [])
+        self.assertIn("exploit", mode_help)  # explicit pin: the mode the old help omitted
+
+
+@unittest.skipUnless(ADDONS_ENABLED['ai'], 'ai addon not installed')
+class TestDetectMode(unittest.TestCase):
+    """D2: _detect_mode must honor an LLM 'exploit' verdict (it used to be discarded); call_llm is stubbed."""
+
+    def _make_task(self, prompt):
+        """Return a bare ai instance carrying only the attributes set below (presumably all _detect_mode reads)."""
+        from secator.tasks.ai import ai
+        t = ai.__new__(ai)  # bypass __init__: no runner setup, only the attributes assigned here exist
+        t.mode = ""            # no explicit mode -> detection runs
+        t.prompt = prompt
+        t.intent_model = "test-intent-model"
+        t.api_base = None
+        t.api_key = None
+        t.backend = MagicMock()
+        t.is_subagent = False
+        t.max_iterations = 10
+        t._account_usage = MagicMock()  # _detect_mode reports LLM usage through this; stubbed to a no-op
+        return t
+
+    def _run_detect(self, prompt, llm_word):
+        """Run _detect_mode with call_llm stubbed to answer `llm_word`; return the resulting t.mode."""
+        from secator.tasks.ai import ai
+        t = self._make_task(prompt)
+        with patch("secator.tasks.ai.call_llm", return_value={"content": llm_word, "usage": {}}), \
+             patch("secator.tasks.ai.get_system_prompt", return_value="sys"), \
+             patch("secator.tasks.ai.build_tool_schemas", return_value=[]), \
+             patch.object(ai, "reports_folder", "/tmp/ws"):
+            t._detect_mode()
+        return t.mode
+
+    def test_llm_exploit_classification_is_honored(self):
+        # prompt is meant to be ambiguous so detection defers to the LLM; verdict "exploit" must stick (was 'chat')
+        self.assertEqual(self._run_detect("take a look at this thing", "exploit"), "exploit")
+
+    def test_llm_attack_classification_unchanged(self):
+        self.assertEqual(self._run_detect("take a look at this thing", "attack"), "attack")
+
+    def test_llm_chat_classification_unchanged(self):
+        self.assertEqual(self._run_detect("take a look at this thing", "chat"), "chat")
+
+    def test_llm_unknown_classification_falls_back_to_chat(self):
+        self.assertEqual(self._run_detect("take a look at this thing", "banana"), "chat")  # verdict not a known mode
+
 
 if __name__ == '__main__':
     unittest.main()