zai-org · wang0618 · Mar 14, 2026
diff --git a/phone_agent/model/client.py b/phone_agent/model/client.py
@@ -143,7 +143,8 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
         total_time = time.time() - start_time
 
         # Parse thinking and action from response
-        thinking, action = self._parse_response(raw_content)
+        thinking, action = parse_response(raw_content)
+        print(f"{thinking=}, {action=}")
 
         # Print performance metrics
         lang = self.config.lang
@@ -173,47 +174,48 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
             total_time=total_time,
         )
 
-    def _parse_response(self, content: str) -> tuple[str, str]:
-        """
-        Parse the model response into thinking and action parts.
 
-        Parsing rules:
-        1. If content contains 'finish(message=', everything before is thinking,
-           everything from 'finish(message=' onwards is action.
-        2. If rule 1 doesn't apply but content contains 'do(action=',
-           everything before is thinking, everything from 'do(action=' onwards is action.
-        3. Fallback: If content contains '<answer>', use legacy parsing with XML tags.
-        4. Otherwise, return empty thinking and full content as action.
+def parse_response(content: str) -> tuple[str, str]:
+    """
+    Parse the model response into thinking and action parts.
 
-        Args:
-            content: Raw response content.
+    Parsing rules (the first matching rule wins):
+    1. If content contains both '<answer>' and '</answer>', thinking is the text before
+        '<answer>' minus the think tags; action is the text after it up to '</answer>'.
+    2. Otherwise, if content contains 'finish(message=', everything before it is thinking
+        and everything from 'finish(message=' onwards is action.
+    3. Otherwise, if content contains 'do(action=', the same split is made at 'do(action='.
+    4. Otherwise, return empty thinking and full content as action.
 
-        Returns:
-            Tuple of (thinking, action).
-        """
-        # Rule 1: Check for finish(message=
-        if "finish(message=" in content:
-            parts = content.split("finish(message=", 1)
-            thinking = parts[0].strip()
-            action = "finish(message=" + parts[1]
-            return thinking, action
-
-        # Rule 2: Check for do(action=
-        if "do(action=" in content:
-            parts = content.split("do(action=", 1)
-            thinking = parts[0].strip()
-            action = "do(action=" + parts[1]
-            return thinking, action
-
-        # Rule 3: Fallback to legacy XML tag parsing
-        if "<answer>" in content:
-            parts = content.split("<answer>", 1)
-            thinking = parts[0].replace("<think>", "").replace("</think>", "").strip()
-            action = parts[1].replace("</answer>", "").strip()
-            return thinking, action
-
-        # Rule 4: No markers found, return content as action
-        return "", content
+    Args:
+        content: Raw response content.
+
+    Returns:
+        Tuple of (thinking, action). Thinking is stripped; action only under rule 1.
+    """
+    # Rule 1: tags take priority, but only if </answer> is present; otherwise fall through.
+    if "<answer>" in content and "</answer>" in content:
+        parts = content.split("<answer>", 1)
+        thinking = parts[0].replace("<think>", "").replace("</think>", "").strip()
+        action = parts[1].split("</answer>", 1)[0].strip()
+        return thinking, action
+
+    # Rule 2: split at the first finish(message= marker; the marker stays in the action.
+    if "finish(message=" in content:
+        parts = content.split("finish(message=", 1)
+        thinking = parts[0].strip()
+        action = "finish(message=" + parts[1]
+        return thinking, action
+
+    # Rule 3: split at the first do(action= marker; the marker stays in the action.
+    if "do(action=" in content:
+        parts = content.split("do(action=", 1)
+        thinking = parts[0].strip()
+        action = "do(action=" + parts[1]
+        return thinking, action
+
+    # Rule 4: No markers found, return content as action
+    return "", content
 
 
 class MessageBuilder:

diff --git a/tests/test_parse_response.py b/tests/test_parse_response.py
@@ -0,0 +1,49 @@
+from phone_agent.model.client import parse_response
+
+
+def test_parse_response() -> None:
+    """Check each parse_response rule, including the unclosed <answer> fallthrough."""
+    launch_zhihu = 'do(action="Launch", app="知乎")'
+    long_thought = "用户需要打开知乎查看首页第一条消息，当前在系统桌面，首先需要启动知乎应用，因此执行Launch操作打开知乎。"
+    plain_text = "这是一段没有动作标记的普通文本"
+    cases = [  # (raw content, expected thinking, expected action)
+        # Closed <think>/<answer> tags: both parts come back without the tags.
+        (
+            """<think>先思考</think>
+<answer>
+do(action="Launch", app="知乎")
+</answer>""",
+            "先思考",
+            launch_zhihu,
+        ),
+        (
+            """<think>用户需要打开知乎查看首页第一条消息，当前在系统桌面，首先需要启动知乎应用，因此执行Launch操作打开知乎。</think>
+<answer>
+do(action="Launch", app="知乎")
+</answer>""",
+            long_thought,
+            launch_zhihu,
+        ),
+        # No tags: the text is split at the finish(message= / do(action= marker.
+        ('先总结一下\nfinish(message="任务完成")', "先总结一下", 'finish(message="任务完成")'),
+        (
+            '先分析页面元素\ndo(action="Tap", element=[120,240])',
+            "先分析页面元素",
+            'do(action="Tap", element=[120,240])',
+        ),
+        ('直接输出动作 do(action="Back")', "直接输出动作", 'do(action="Back")'),
+        # No marker at all: the whole text is returned as the action.
+        (plain_text, "", plain_text),
+        # Unclosed <answer>: handled by the do(action= rule, so the tags stay in thinking.
+        (
+            """<think>先思考</think>
+<answer>
+do(action="Launch", app="知乎")""",
+            "<think>先思考</think>\n<answer>",
+            launch_zhihu,
+        ),
+    ]
+    for content, expected_thinking, expected_action in cases:
+        thinking, action = parse_response(content)
+        assert thinking == expected_thinking
+        assert action == expected_action