Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 41 additions & 39 deletions phone_agent/model/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,8 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
total_time = time.time() - start_time

# Parse thinking and action from response
thinking, action = self._parse_response(raw_content)
thinking, action = parse_response(raw_content)
print(f"{thinking=}, {action=}")

# Print performance metrics
lang = self.config.lang
Expand Down Expand Up @@ -173,47 +174,48 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
total_time=total_time,
)

def parse_response(content: str) -> tuple[str, str]:
    """
    Parse the model response into thinking and action parts.

    Parsing rules (first match wins):
        1. If content contains both '<answer>' and '</answer>', parse by the
           XML-like tags: the text before '<answer>' (with '<think>' /
           '</think>' removed) is thinking, and the text between '<answer>'
           and the first following '</answer>' is action.
        2. Otherwise, if content contains 'finish(message=', everything
           before it is thinking and everything from it onwards is action.
        3. Otherwise, if content contains 'do(action=', everything before it
           is thinking and everything from it onwards is action.
        4. Otherwise, return empty thinking and the full content as action.

    Args:
        content: Raw response content.

    Returns:
        Tuple of (thinking, action).
    """
    # Rule 1: Prefer XML-like tag parsing, but only when the answer block is
    # closed; an unclosed '<answer>' falls through to the marker rules below.
    if "<answer>" in content and "</answer>" in content:
        before, _, after = content.partition("<answer>")
        thinking = before.replace("<think>", "").replace("</think>", "").strip()
        action = after.partition("</answer>")[0].strip()
        return thinking, action

    # Rules 2 and 3: 'finish(message=' takes precedence over 'do(action='.
    for marker in ("finish(message=", "do(action="):
        before, found, after = content.partition(marker)
        if found:
            # Drop trailing whitespace so the action text is the same
            # whether it arrived tagged (rule 1) or as a bare call.
            return before.strip(), (marker + after).rstrip()

    # Rule 4: No markers found, return content unchanged as the action.
    return "", content


class MessageBuilder:
Expand Down
49 changes: 49 additions & 0 deletions tests/test_parse_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from phone_agent.model.client import parse_response


def test_parse_response() -> None:
    """Check parse_response against every documented parsing rule.

    Cases are table-driven and all of them are evaluated before asserting,
    so one failing scenario does not hide the others and the failure message
    names each scenario that went wrong.
    """
    launch = 'do(action="Launch", app="知乎")'
    long_thought = (
        "用户需要打开知乎查看首页第一条消息,当前在系统桌面,"
        "首先需要启动知乎应用,因此执行Launch操作打开知乎。"
    )
    plain_text = "这是一段没有动作标记的普通文本"

    # (label, raw content, expected thinking, expected action)
    cases = [
        (
            "closed answer tags",
            f"<think>先思考</think>\n<answer>\n{launch}\n</answer>",
            "先思考",
            launch,
        ),
        (
            "closed answer tags with long thinking",
            f"<think>{long_thought}</think>\n<answer>\n{launch}\n</answer>",
            long_thought,
            launch,
        ),
        (
            "finish marker",
            '先总结一下\nfinish(message="任务完成")',
            "先总结一下",
            'finish(message="任务完成")',
        ),
        (
            "do marker on its own line",
            '先分析页面元素\ndo(action="Tap", element=[120,240])',
            "先分析页面元素",
            'do(action="Tap", element=[120,240])',
        ),
        (
            "do marker on the same line as thinking",
            '直接输出动作 do(action="Back")',
            "直接输出动作",
            'do(action="Back")',
        ),
        (
            "no markers",
            plain_text,
            "",
            plain_text,
        ),
        (
            # An unclosed <answer> must not use tag parsing; the do(action=
            # rule applies instead, so the raw tags stay in the thinking part.
            "unclosed answer tag",
            f"<think>先思考</think>\n<answer>\n{launch}",
            "<think>先思考</think>\n<answer>",
            launch,
        ),
    ]

    failures = []
    for label, content, want_thinking, want_action in cases:
        got = parse_response(content)
        if got != (want_thinking, want_action):
            failures.append(
                f"{label}: expected {(want_thinking, want_action)!r}, got {got!r}"
            )

    assert not failures, "\n".join(failures)