From f2c5e21f69c892b233010d992bf0dd83f6a262c4 Mon Sep 17 00:00:00 2001 From: Mickael Farina Date: Sun, 3 May 2026 22:49:39 +0200 Subject: [PATCH] hotfix: retry on path/read_path/domain hallucinations too (PR #35) PR #34 only retried `skill_not_authorized`. Real-world Qwen drift hits `domain_not_authorized` and path violations just as often (e.g. plan allows api.exchangerate-api.com, model emits bare exchangerate-api.com). Refactor the retry block in _execute_checkpoint to dispatch on all four PermissionViolation reasons via _build_correction_nudge(): - skill_not_authorized -> list allowed skills - path_not_authorized -> list allowed write_paths globs - read_path_not_authorized -> list allowed read_paths globs - domain_not_authorized -> list allowed network_domains Each nudge appended to history with _skill_correction_nudge marker so _qwen_next_action sees the corrected closed-world allowlist on the retry. SECOND consecutive miss still raises -> blocked_on_permission. Tests (4 total in this slice, 46 total in test_agent_runner.py): - existing skill retry test still green - domain retry test (forex anchor scenario) - write_path retry test - read_path retry test Co-Authored-By: Claude Opus 4.7 (1M context) --- codec_agent_runner.py | 98 ++++++++++++++++++-------- tests/test_agent_runner.py | 137 +++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 29 deletions(-) diff --git a/codec_agent_runner.py b/codec_agent_runner.py index 16a5bd1..fdb8fd8 100644 --- a/codec_agent_runner.py +++ b/codec_agent_runner.py @@ -322,6 +322,52 @@ class StepBudgetExhausted(Exception): """Per-checkpoint step budget cap reached without checkpoint_done.""" +def _build_correction_nudge(pv: "PermissionViolation", + action: Action, + agent_grants: Dict[str, Any], + global_grants: Dict[str, Any]) -> Optional[str]: + """PR #35: build a single-shot correction string for the LLM when + it picks something outside the allowlist. Returns None for unknown + reasons (caller falls back to raise). + + The string is appended to history.result so the next + _qwen_next_action call sees it as the most-recent step output, and + the model corrects itself instead of looping. We list the FULL + allowed set so the model has a closed-world choice — listing + nothing was the original PR #34 bug for skills; same logic applies + to paths and domains.""" + reason = pv.reason + if reason == "skill_not_authorized": + allowed = sorted(set(agent_grants.get("skills", [])) | + set(global_grants.get("skills", []))) + return (f"") + if reason == "path_not_authorized": + allowed = sorted(set(agent_grants.get("write_paths", [])) | + set(global_grants.get("write_paths", []))) + return (f"") + if reason == "read_path_not_authorized": + allowed = sorted(set(agent_grants.get("read_paths", [])) | + set(global_grants.get("read_paths", []))) + return (f"") + if reason == "domain_not_authorized": + allowed = sorted(set(agent_grants.get("network_domains", [])) | + set(global_grants.get("network_domains", []))) + return (f"") + return None + + def _run_skill(skill_name: str, task: str, agent_id: str) -> str: """Lazy-imported codec_dispatch.run_skill. Step 1+2 hooks fire automatically inside run_skill via run_with_hooks.""" @@ -365,39 +411,33 @@ def _execute_checkpoint(plan_dict: Dict[str, Any], return history # Permission gate (raises PermissionViolation if outside manifest). - # Phase 3.5 hotfix: if the LLM hallucinates a skill name (e.g. - # "fetch_url" instead of the real "web_fetch"), give it ONE retry - # with the corrected skill list as context. Most skill-hallucination - # errors recover with a single correction pass; only block on the - # SECOND consecutive miss. This dramatically reduces user-visible + # Phase 3.5 hotfix (PR #34 + #35): if the LLM hallucinates a skill + # name, write path, read path, or network domain, give it ONE retry + # with the corrected allowlist as context. Most hallucinations + # recover with a single correction pass; only block on the SECOND + # consecutive miss. This dramatically reduces user-visible # blocked_on_permission events caused by LLM naming drift. try: permission_gate(action, agent_grants, global_grants) except PermissionViolation as pv: - if pv.reason == "skill_not_authorized": - # Append the failed action to history with a correction nudge - # so the next _qwen_next_action call sees the error context. - allowed = sorted(set(agent_grants.get("skills", [])) | - set(global_grants.get("skills", []))) - history.append({ - "step": len(history), - "skill": action.skill, - "task": action.task[:200], - "result": (f""), - "is_destructive": False, - "_skill_correction_nudge": True, - }) - # Re-call Qwen — if it still picks the wrong skill, fall through - # and the SECOND permission_gate call will raise normally. - action2 = _qwen_next_action(plan_dict, checkpoint, history) - if action2.kind == "checkpoint_done": - return history - permission_gate(action2, agent_grants, global_grants) - action = action2 # use the corrected action going forward - else: - raise # path / domain violations not auto-recoverable + nudge = _build_correction_nudge(pv, action, agent_grants, global_grants) + if nudge is None: + raise # unknown reason — fall through unchanged + history.append({ + "step": len(history), + "skill": action.skill, + "task": action.task[:200], + "result": nudge, + "is_destructive": False, + "_skill_correction_nudge": True, + }) + # Re-call Qwen — if it still picks something invalid, fall through + # and the SECOND permission_gate call will raise normally. + action2 = _qwen_next_action(plan_dict, checkpoint, history) + if action2.kind == "checkpoint_done": + return history + permission_gate(action2, agent_grants, global_grants) + action = action2 # use the corrected action going forward # Destructive gate (raises DestructiveOpRejected on user reject) if action.is_destructive: diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py index 9c1dd15..bf2a8a5 100644 --- a/tests/test_agent_runner.py +++ b/tests/test_agent_runner.py @@ -995,3 +995,140 @@ def fake_next(*a, **k): nudges = [h for h in history if h.get("_skill_correction_nudge")] assert len(nudges) == 1 assert "fetch_url" in nudges[0]["skill"] + + +# ───────────────────────────────────────────────────────────────────────────── +# PR #35 — extend hallucination retry to path / read_path / domain (3 tests) +# ───────────────────────────────────────────────────────────────────────────── + +def test_domain_hallucination_retries_with_corrected_domain_list(monkeypatch, temp_codec_dir): + """When LLM picks an unauthorized network_domain (e.g. bare host instead + of the api.* one in the manifest), runner appends a correction nudge + listing allowed domains and re-calls Qwen. The corrected call runs.""" + import codec_agent_runner as car + + grants = {"skills": ["web_fetch"], "read_paths": [], "write_paths": [], + "network_domains": ["api.exchangerate-api.com"]} + global_grants = {"schema": 1, "version": 0, + "skills": [], "read_paths": [], "write_paths": [], "network_domains": []} + checkpoint = {"id": "cp1", "title": "t", "description": "d", + "expected_output": "o", "step_budget": 5} + + actions = [ + car.Action(skill="web_fetch", task="get rates", kind="skill_call", + is_destructive=False, network_call=True, + network_domain="exchangerate-api.com"), # bare host — not in manifest + car.Action(skill="web_fetch", task="get rates", kind="skill_call", + is_destructive=False, network_call=True, + network_domain="api.exchangerate-api.com"), # corrected + car.Action(skill="", task="", kind="checkpoint_done"), + ] + idx = {"n": 0} + def fake_next(*a, **k): + out = actions[idx["n"]] + idx["n"] += 1 + return out + monkeypatch.setattr(car, "_qwen_next_action", fake_next) + fake_run = MagicMock(return_value="USD/EUR=0.92") + monkeypatch.setattr(car, "_run_skill", fake_run) + + history = car._execute_checkpoint( + plan_dict={"goals": ["g"]}, checkpoint=checkpoint, + agent_grants=grants, global_grants=global_grants, + agent_id="agent_test", + ) + + # Corrected web_fetch ran exactly once; bare-host attempt was rejected pre-execution + fake_run.assert_called_once_with("web_fetch", "get rates", "agent_test") + nudges = [h for h in history if h.get("_skill_correction_nudge")] + assert len(nudges) == 1 + assert "domain_error" in nudges[0]["result"] + assert "api.exchangerate-api.com" in nudges[0]["result"] + + +def test_write_path_hallucination_retries_with_corrected_path_list(monkeypatch, temp_codec_dir): + """When LLM picks an unauthorized write path, runner appends a + correction nudge listing allowed write_paths globs and re-calls Qwen.""" + import codec_agent_runner as car + + grants = {"skills": ["file_write"], + "read_paths": [], "write_paths": ["~/Documents/agents/**"], + "network_domains": []} + global_grants = {"schema": 1, "version": 0, + "skills": [], "read_paths": [], "write_paths": [], "network_domains": []} + checkpoint = {"id": "cp1", "title": "t", "description": "d", + "expected_output": "o", "step_budget": 5} + + actions = [ + car.Action(skill="file_write", task="write report", kind="skill_call", + is_destructive=False, touches_path=True, + path="/tmp/report.md"), # outside glob + car.Action(skill="file_write", task="write report", kind="skill_call", + is_destructive=False, touches_path=True, + path="~/Documents/agents/forex/report.md"), # corrected + car.Action(skill="", task="", kind="checkpoint_done"), + ] + idx = {"n": 0} + def fake_next(*a, **k): + out = actions[idx["n"]] + idx["n"] += 1 + return out + monkeypatch.setattr(car, "_qwen_next_action", fake_next) + fake_run = MagicMock(return_value="ok") + monkeypatch.setattr(car, "_run_skill", fake_run) + + history = car._execute_checkpoint( + plan_dict={"goals": ["g"]}, checkpoint=checkpoint, + agent_grants=grants, global_grants=global_grants, + agent_id="agent_test", + ) + + fake_run.assert_called_once_with("file_write", "write report", "agent_test") + nudges = [h for h in history if h.get("_skill_correction_nudge")] + assert len(nudges) == 1 + assert "path_error" in nudges[0]["result"] + assert "~/Documents/agents/**" in nudges[0]["result"] + + +def test_read_path_hallucination_retries_with_corrected_path_list(monkeypatch, temp_codec_dir): + """When LLM picks an unauthorized read path, runner appends a + correction nudge listing allowed read_paths and re-calls Qwen.""" + import codec_agent_runner as car + + grants = {"skills": ["file_read"], + "read_paths": ["~/Documents/research/**"], + "write_paths": [], "network_domains": []} + global_grants = {"schema": 1, "version": 0, + "skills": [], "read_paths": [], "write_paths": [], "network_domains": []} + checkpoint = {"id": "cp1", "title": "t", "description": "d", + "expected_output": "o", "step_budget": 5} + + actions = [ + car.Action(skill="file_read", task="read", kind="skill_call", + is_destructive=False, reads_path=True, + read_path="/etc/passwd"), # outside allowlist + car.Action(skill="file_read", task="read", kind="skill_call", + is_destructive=False, reads_path=True, + read_path="~/Documents/research/notes.md"), # corrected + car.Action(skill="", task="", kind="checkpoint_done"), + ] + idx = {"n": 0} + def fake_next(*a, **k): + out = actions[idx["n"]] + idx["n"] += 1 + return out + monkeypatch.setattr(car, "_qwen_next_action", fake_next) + fake_run = MagicMock(return_value="content") + monkeypatch.setattr(car, "_run_skill", fake_run) + + history = car._execute_checkpoint( + plan_dict={"goals": ["g"]}, checkpoint=checkpoint, + agent_grants=grants, global_grants=global_grants, + agent_id="agent_test", + ) + + fake_run.assert_called_once_with("file_read", "read", "agent_test") + nudges = [h for h in history if h.get("_skill_correction_nudge")] + assert len(nudges) == 1 + assert "read_path_error" in nudges[0]["result"] + assert "~/Documents/research/**" in nudges[0]["result"]