From a3b3ad6bdd3d36617862359d59721516e518d926 Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Wed, 1 Jul 2026 19:08:43 +0200 Subject: [PATCH] fix(ai): honor exploit mode in detection + document it (D2) `_detect_mode` accepted only ("attack","chat") from the intent LLM and discarded an "exploit" classification (fell back to old_mode/chat), even though the full exploit mode exists (MODES entry, SYSTEM_EXPLOIT prompt, get_system_prompt branch, _selection.txt classifies it). So exploit mode was unreachable by detection and the `mode` opt help omitted it. - _detect_mode: accept any mode in MODES (incl. exploit); attack/chat unchanged. - `mode` opt help derived from MODES.keys() (DRY, no drift). - Tests: LLM "exploit" verdict now sets mode=exploit; attack/chat/unknown unchanged; exploit system prompt renders with no leftover template vars. Co-Authored-By: Claude Opus 4.8 --- secator/tasks/ai.py | 6 ++-- tests/unit/test_ai_prompts.py | 20 ++++++++++++ tests/unit/test_ai_task_opts.py | 54 +++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/secator/tasks/ai.py b/secator/tasks/ai.py index b750ad114..bf5e86226 100644 --- a/secator/tasks/ai.py +++ b/secator/tasks/ai.py @@ -22,7 +22,7 @@ from secator.ai.encryption import SensitiveDataEncryptor, maybe_encrypt from secator.ai.history import ChatHistory, truncate_to_tokens, get_context_window from secator.ai.prompts import ( - load_prompt, get_system_prompt, get_mode_config, format_tool_result, format_continue + load_prompt, get_system_prompt, get_mode_config, format_tool_result, format_continue, MODES ) from secator.ai.tools import build_tool_schemas, tool_call_to_action, TOOL_SCHEMAS from secator.ai.session import save_history, show_session_picker, replay_session, restore_history_from_db @@ -71,7 +71,7 @@ class ai(PythonRunner): opts = { "name": {"type": str, "default": "", "short": "n", "internal_name": "session_name", "help": "Name for the AI session or subagent"}, # noqa: E501 "prompt": {"type": str, "default": "", "short": "p", "help": "Prompt"}, - "mode": {"type": str, "default": "", "help": "Mode: attack or chat"}, + "mode": {"type": str, "default": "", "help": f"Mode: {', '.join(MODES)}"}, # D2: derive from MODES, don't drift "model": {"type": str, "default": CONFIG.addons.ai.default_model, "help": "LLM model"}, # Never set a secret/CONFIG value as a task-option `default`: secator-api # serves task opts (including defaults) to the UI, so a CONFIG default @@ -761,7 +761,7 @@ def _detect_mode(self, force=False): result = call_llm(messages, self.intent_model, temperature=0.3, api_base=self.api_base, api_key=self.api_key) # noqa: E501 self._account_usage(result.get("usage")) mode = result["content"].strip().lower() - if mode in ("attack", "chat"): + if mode in MODES: # D2: honor any real mode (incl. exploit), don't discard it console.print(rf"[bold green]\[INF][/] Detected intent: [bold]{mode}[/]") self.mode = mode else: diff --git a/tests/unit/test_ai_prompts.py b/tests/unit/test_ai_prompts.py index 5b63e349b..e05513c1f 100644 --- a/tests/unit/test_ai_prompts.py +++ b/tests/unit/test_ai_prompts.py @@ -101,6 +101,26 @@ def test_get_system_prompt_exploit(self): self.assertIn("exploitation verification specialist", prompt) self.assertIn("proof-of-concept", prompt) + def test_get_system_prompt_exploit_no_leftover_placeholders(self): + """D2: exploit renders fully — no unresolved ${include} or template $vars. + + (Literal Mongo operators like $in/$regex and example secrets like $API_KEY + are content, not Template vars, so we check the template names explicitly.) + """ + import re + prompt = get_system_prompt("exploit") + # All ${include} directives resolved (load_prompt) and $var substitutions done. + self.assertEqual(re.findall(r'\$\{\w+\}', prompt), [], "unresolved ${include} in exploit prompt") + template_vars = [ + "library_reference", "discovery", "common", "queries", "findings", + "arsenal", "guardrails", "isolation", "exploitation_report", + "workspace_path", "query_types", "output_types_reference", + ] + leftover = [v for v in template_vars if f"${v}" in prompt] + self.assertEqual(leftover, [], f"unresolved template vars in exploit prompt: {leftover}") + # uses the exploit template, not attack/chat + self.assertIn("exploitation verification specialist", prompt) + def test_get_system_prompt_attack_has_library_reference(self): prompt = get_system_prompt("attack") self.assertIn('', prompt) diff --git a/tests/unit/test_ai_task_opts.py b/tests/unit/test_ai_task_opts.py index 518dec89b..401c0a0ec 100644 --- a/tests/unit/test_ai_task_opts.py +++ b/tests/unit/test_ai_task_opts.py @@ -1,5 +1,6 @@ """Tests for AI task subagent opts.""" import unittest +from unittest.mock import MagicMock, patch from secator.definitions import ADDONS_ENABLED @@ -25,6 +26,59 @@ def test_max_workers_opt_exists(self): self.assertEqual(ai.opts["max_workers"].get("default"), 3) self.assertTrue(ai.opts["max_workers"].get("internal", False)) + def test_mode_opt_help_lists_all_modes(self): + """D2: the mode opt help documents every real mode (derived from MODES).""" + from secator.tasks.ai import ai + from secator.ai.prompts import MODES + help_text = ai.opts["mode"]["help"] + for mode in MODES: + self.assertIn(mode, help_text) + self.assertIn("exploit", help_text) # the previously-omitted one + + +@unittest.skipUnless(ADDONS_ENABLED['ai'], 'ai addon not installed') +class TestDetectMode(unittest.TestCase): + """D2: _detect_mode must honor an LLM 'exploit' classification (was discarded).""" + + def _make_task(self, prompt): + """Bare ai instance with just the attributes _detect_mode reads.""" + from secator.tasks.ai import ai + t = ai.__new__(ai) + t.mode = "" # no explicit mode -> detection runs + t.prompt = prompt + t.intent_model = "test-intent-model" + t.api_base = None + t.api_key = None + t.backend = MagicMock() + t.is_subagent = False + t.max_iterations = 10 + t._account_usage = MagicMock() + return t + + def _run_detect(self, prompt, llm_word): + """Force the LLM branch (ambiguous prompt) and stub call_llm's verdict.""" + from secator.tasks.ai import ai + t = self._make_task(prompt) + with patch("secator.tasks.ai.call_llm", return_value={"content": llm_word, "usage": {}}), \ + patch("secator.tasks.ai.get_system_prompt", return_value="sys"), \ + patch("secator.tasks.ai.build_tool_schemas", return_value=[]), \ + patch.object(ai, "reports_folder", "/tmp/ws"): + t._detect_mode() + return t.mode + + def test_llm_exploit_classification_is_honored(self): + # ambiguous prompt -> defers to LLM; LLM says exploit -> mode is exploit (was 'chat') + self.assertEqual(self._run_detect("take a look at this thing", "exploit"), "exploit") + + def test_llm_attack_classification_unchanged(self): + self.assertEqual(self._run_detect("take a look at this thing", "attack"), "attack") + + def test_llm_chat_classification_unchanged(self): + self.assertEqual(self._run_detect("take a look at this thing", "chat"), "chat") + + def test_llm_unknown_classification_falls_back_to_chat(self): + self.assertEqual(self._run_detect("take a look at this thing", "banana"), "chat") + if __name__ == '__main__': unittest.main()