microsoft · alzhang-git · Feb 28, 2026 · Feb 28, 2026 · Feb 28, 2026 · Mar 1, 2026
diff --git a/amplifier_module_provider_github_copilot/converters.py b/amplifier_module_provider_github_copilot/converters.py
@@ -93,8 +93,29 @@ def convert_messages_to_prompt(messages: list[dict[str, Any]]) -> str:
             assistant_text = content
             tool_calls = msg.get("tool_calls", [])
 
+            # If tool_calls key is missing/empty but content blocks contain
+            # tool_call/tool_use entries, extract them so conversation history
+            # is correctly serialized (prevents lost tool context on replay).
+            if not tool_calls and isinstance(msg.get("content"), list):
+                # Rebind to a fresh list instead of appending: msg["tool_calls"]
+                # may be None (no .append) or the caller's own empty list.
+                tool_calls = [
+                    {
+                        "name": block.get("name", "unknown"),
+                        "arguments": block.get(
+                            "input", block.get("arguments", {})
+                        ),
+                        "id": block.get("id", ""),
+                    }
+                    for block in msg["content"]
+                    if isinstance(block, dict)
+                    and block.get("type") in ("tool_call", "tool_use")
+                ]
+
             if tool_calls:
-                # Include tool call information
+                # Include tool call history using XML tags that are clearly marked as
+                # past actions. Do NOT use [Tool Call: ...] format — the model mimics it
+                # and writes fake tool calls as text instead of using structured calling.
                 tool_parts = []
                 for tc in tool_calls:
                     tool_name = tc.get("name", tc.get("function", {}).get("name", "unknown"))
@@ -104,7 +125,9 @@ def convert_messages_to_prompt(messages: list[dict[str, Any]]) -> str:
                             tool_args = json.loads(tool_args)
                         except json.JSONDecodeError:
                             pass
-                    tool_parts.append(f"[Tool Call: {tool_name}({json.dumps(tool_args)})]")
+                    tool_parts.append(
+                        f"<tool_used name=\"{tool_name}\">{json.dumps(tool_args)}</tool_used>"
+                    )
                 if assistant_text:
                     parts.append(f"Assistant: {assistant_text}\n" + "\n".join(tool_parts))
                 else:
@@ -114,12 +137,12 @@ def convert_messages_to_prompt(messages: list[dict[str, Any]]) -> str:
         elif role == "tool":
             # Tool results from Amplifier's tool execution
             tool_name = msg.get("tool_name", msg.get("name", "tool"))
-            parts.append(f"Tool Result ({tool_name}): {content}")
+            parts.append(f"<tool_result name=\"{tool_name}\">{content}</tool_result>")
         elif role == "function":
             # Legacy function role (deprecated by OpenAI in favor of 'tool')
             # Handle as alias for tool result for backward compatibility
             func_name = msg.get("name", "function")
-            parts.append(f"Tool Result ({func_name}): {content}")
+            parts.append(f"<tool_result name=\"{func_name}\">{content}</tool_result>")
         else:
             # Unknown role, treat as user
             logger.warning(f"[CONVERTER] Unknown message role: {role}")
@@ -155,13 +178,21 @@ def _extract_content(msg: dict[str, Any]) -> str:
             if isinstance(block, str):
                 text_parts.append(block)
             elif isinstance(block, dict):
-                if block.get("type") == "text":
+                block_type = block.get("type", "")
+                if block_type in ("tool_call", "tool_use", "tool_result"):
+                    # Skip tool call/result blocks — they are not text content
+                    # and must not leak into the serialized prompt
+                    continue
+                if block_type == "text":
                     text_parts.append(block.get("text", ""))
-                elif block.get("type") == "image_url":
+                elif block_type == "image_url":
                     text_parts.append("[Image]")
+                elif block_type == "thinking":
+                    continue  # thinking blocks are not user-visible text
                 else:
-                    # Unknown block type
-                    text_parts.append(str(block.get("text", block.get("content", ""))))
+                    text_val = block.get("text", block.get("content", ""))
+                    if text_val:
+                        text_parts.append(str(text_val))
         return "\n".join(text_parts)
 
     # Fallback

diff --git a/amplifier_module_provider_github_copilot/provider.py b/amplifier_module_provider_github_copilot/provider.py
@@ -32,6 +32,7 @@
 
 import asyncio
 import logging
+import re
 import time
 from collections import OrderedDict
 from typing import Any
@@ -1010,6 +1011,74 @@ async def _on_retry(attempt: int, delay: float, error: KernelLLMError) -> None:
         response = await retry_with_backoff(_do_complete, self._retry_config, on_retry=_on_retry)
         elapsed_ms = int((time.time() - outer_start) * 1000)
 
+        # ── Fix 2: Defensive detection of fake tool calls ──────────────
+        # When the LLM writes tool calls as plain text instead of issuing
+        # structured tool_requests, the orchestrator would display fake
+        # results that were never actually executed.  Detect this and
+        # retry with a correction message (up to 2 times).
+        _FAKE_TOOL_CALL_RE = re.compile(
+            r"\[Tool Call:\s*\w+\("       # [Tool Call: name(
+            r"|Tool Result \(\w+\):"      # Tool Result (name):
+            r"|<tool_used\s+name="        # <tool_used name=  (XML format mimicked)
+        )
+        _MAX_FAKE_TC_RETRIES = 2
+
+        if request_tools and not response.tool_calls:
+            # Extract all text from content blocks
+            response_text = ""
+            for block in response.content or []:
+                if hasattr(block, "text"):
+                    response_text += block.text
+
+            fake_retry = 0
+            while (
+                _FAKE_TOOL_CALL_RE.search(response_text)
+                and fake_retry < _MAX_FAKE_TC_RETRIES
+            ):
+                fake_retry += 1
+                logger.warning(
+                    f"[PROVIDER] Detected fake tool call text in response "
+                    f"(retry {fake_retry}/{_MAX_FAKE_TC_RETRIES}). "
+                    f"Re-prompting LLM to use structured tool calls."
+                )
+                await self._emit_event(
+                    "provider:fake_tool_retry",
+                    {
+                        "provider": self.name,
+                        "model": model,
+                        "retry": fake_retry,
+                    },
+                )
+
+                # Append a correction hint to the messages and re-complete
+                correction_msg = {
+                    "role": "user",
+                    "content": (
+                        "IMPORTANT: You just wrote tool calls as plain text "
+                        "instead of invoking them. That text was discarded. "
+                        "You MUST use the structured tool calling mechanism "
+                        "provided by the system — do NOT write tool names, "
+                        "arguments, or results as text. Retry now using real "
+                        "tool calls."
+                    ),
+                }
+                messages.append(correction_msg)
+                prompt = convert_messages_to_prompt(messages)
+
+                response = await retry_with_backoff(
+                    _do_complete, self._retry_config, on_retry=_on_retry
+                )
+
+                # Re-check text
+                if response.tool_calls:
+                    break
+                response_text = ""
+                for block in response.content or []:
+                    if hasattr(block, "text"):
+                        response_text += block.text
+
+            elapsed_ms = int((time.time() - outer_start) * 1000)
+
         if self._debug:
             content_preview = self._truncate(str(response.content))
             logger.debug(f"[PROVIDER] Response content: {content_preview}")

diff --git a/tests/integration/test_multi_model_saturation.py b/tests/integration/test_multi_model_saturation.py
@@ -281,7 +281,7 @@ async def run_scenario(model: str, turns: int, prompt: str, tag: str) -> dict[st
 
         # Check how many tool call patterns exist in the serialized prompt
         serialized_prompt = convert_messages_to_prompt(messages)
-        tc_text_count = len(re.findall(r'\[Tool Call:', serialized_prompt))
+        tc_text_count = len(re.findall(r'<tool_used\s+name=', serialized_prompt))
 
         request = Mock()
         request.messages = messages

diff --git a/tests/test_converters.py b/tests/test_converters.py
@@ -77,7 +77,7 @@ def test_tool_result_message(self):
             }
         ]
         result = convert_messages_to_prompt(messages)
-        assert "Tool Result (read_file): file contents here" in result
+        assert '<tool_result name="read_file">file contents here</tool_result>' in result
 
     def test_assistant_with_tool_calls(self):
         """Should include tool call information in assistant message."""
@@ -98,7 +98,7 @@ def test_assistant_with_tool_calls(self):
 
         assert "Assistant:" in result
         assert "Let me check that." in result
-        assert "[Tool Call: read_file" in result
+        assert '<tool_used name="read_file">' in result
 
     def test_list_content_blocks(self):
         """Should handle OpenAI-style list content blocks."""
@@ -286,7 +286,7 @@ def test_tool_call_arguments_as_json_string(self):
         ]
         result = convert_messages_to_prompt(messages)
 
-        assert "[Tool Call: search" in result
+        assert '<tool_used name="search">' in result
         # Arguments should be parsed and re-serialized
         assert "python" in result
 
@@ -308,7 +308,7 @@ def test_tool_call_arguments_invalid_json_string(self):
         result = convert_messages_to_prompt(messages)
 
         # Should not crash, include the tool call somehow
-        assert "[Tool Call: custom" in result
+        assert '<tool_used name="custom">' in result
 
     def test_assistant_with_tool_calls_no_content(self):
         """Should handle assistant message with tool calls but no text."""
@@ -328,7 +328,7 @@ def test_assistant_with_tool_calls_no_content(self):
         result = convert_messages_to_prompt(messages)
 
         assert "Assistant:" in result
-        assert "[Tool Call: read_file" in result
+        assert '<tool_used name="read_file">' in result
 
     def test_unknown_role_message(self):
         """Should handle unknown message roles gracefully."""
@@ -381,7 +381,7 @@ def test_tool_name_fallback(self):
         ]
         result = convert_messages_to_prompt(messages)
 
-        assert "Tool Result" in result
+        assert "<tool_result" in result
         assert "result content" in result
 
     def test_tool_call_with_function_format(self):
@@ -403,7 +403,7 @@ def test_tool_call_with_function_format(self):
         ]
         result = convert_messages_to_prompt(messages)
 
-        assert "[Tool Call: get_weather" in result
+        assert '<tool_used name="get_weather">' in result
         assert "Seattle" in result
 
 
@@ -553,7 +553,7 @@ def test_function_message_formatted_as_tool_result(self):
         result = convert_messages_to_prompt(messages)
 
         # Should be formatted as tool result for consistency
-        assert "Tool Result (get_weather):" in result
+        assert '<tool_result name="get_weather">' in result
         assert "72 degrees, sunny" in result
 
     def test_function_message_no_name_fallback(self):
@@ -562,7 +562,7 @@ def test_function_message_no_name_fallback(self):
         result = convert_messages_to_prompt(messages)
 
         # Should use 'function' as fallback name
-        assert "Tool Result (function):" in result
+        assert '<tool_result name="function">' in result
         assert "result content" in result
 
     def test_mixed_tool_and_function_roles(self):
@@ -574,8 +574,8 @@ def test_mixed_tool_and_function_roles(self):
         result = convert_messages_to_prompt(messages)
 
         # Both should appear as Tool Results
-        assert "Tool Result (read_file): file A contents" in result
-        assert "Tool Result (write_file): file B written" in result
+        assert '<tool_result name="read_file">file A contents</tool_result>' in result
+        assert '<tool_result name="write_file">file B written</tool_result>' in result
 
 
 class TestAllSixRolesIntegration:
@@ -612,10 +612,10 @@ def test_all_six_roles_in_conversation(self):
 
         # Assistant with tool call
         assert "Assistant:" in result
-        assert "[Tool Call: get_time" in result
+        assert '<tool_used name="get_time">' in result
 
         # Tool result
-        assert "Tool Result (get_time): 10:30 AM" in result
+        assert '<tool_result name="get_time">10:30 AM</tool_result>' in result
 
         # Function as legacy tool result
-        assert "Tool Result (legacy_func): legacy result" in result
+        assert '<tool_result name="legacy_func">legacy result</tool_result>' in result