# Markers of the tagged response format.
# NOTE(review): in the reviewed text these four literals were empty strings,
# which makes `"" in content` always true and `content.split("", 1)` raise
# ValueError ("empty separator") on every call. They are restored here as
# <answer>/<think> tags on the assumption that the markup was stripped in
# transit -- confirm against the model's actual output format.
_ANSWER_OPEN = "<answer>"
_ANSWER_CLOSE = "</answer>"
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"

# Function-call style markers, checked in this priority order.
_CALL_MARKERS = ("finish(message=", "do(action=")


def parse_response(content: str) -> tuple[str, str]:
    """
    Parse the model response into thinking and action parts.

    Parsing rules:
    1. If content contains both an opening and a closing answer tag, the text
       before the opening tag (with think tags removed) is the thinking and
       the text between the answer tags is the action. Both are stripped.
    2. If rule 1 doesn't apply but content contains 'finish(message=',
       everything before is thinking, everything from 'finish(message=' onwards is action.
    3. If rule 2 doesn't apply but content contains 'do(action=',
       everything before is thinking, everything from 'do(action=' onwards is action.
    4. Otherwise, return empty thinking and full content as action.

    For rules 2 and 3 only the thinking part is stripped; the action keeps
    any trailing text exactly as it appeared in the response.

    Args:
        content: Raw response content.

    Returns:
        Tuple of (thinking, action).
    """
    # Rule 1: prefer tag-based parsing, but only when the answer is properly
    # delimited; a lone opening tag falls through to the marker rules.
    if _ANSWER_OPEN in content and _ANSWER_CLOSE in content:
        before_answer, _, after_open = content.partition(_ANSWER_OPEN)
        thinking = (
            before_answer.replace(_THINK_OPEN, "").replace(_THINK_CLOSE, "").strip()
        )
        # Keep only what precedes the first closing tag after the opening one.
        action = after_open.partition(_ANSWER_CLOSE)[0].strip()
        return thinking, action

    # Rules 2 and 3: split at the first occurrence of the highest-priority
    # call marker that is present.
    for marker in _CALL_MARKERS:
        before_marker, found, after_marker = content.partition(marker)
        if found:
            return before_marker.strip(), marker + after_marker

    # Rule 4: no markers found, return content as action.
    return "", content
def test_parse_response() -> None:
    """Check each parsing rule of ``parse_response`` against a fixed input.

    Every case is ``(content, expected_thinking, expected_action)``.

    NOTE(review): the tag markup in the multi-line inputs is a reconstruction.
    In the reviewed text those inputs carried no tags and ended with a
    newline, so the asserted actions (no trailing newline) and the final
    ``thinking`` value (trailing newline) could not be produced by any rule.
    Confirm the intended inputs against the model's real output format.
    """
    # Imported inside the function so this test is self-contained.
    from phone_agent.model.client import parse_response

    long_thinking = (
        "用户需要打开知乎查看首页第一条消息,当前在系统桌面,"
        "首先需要启动知乎应用,因此执行Launch操作打开知乎。"
    )
    launch = 'do(action="Launch", app="知乎")'

    cases = [
        # Rule 1: answer tags present -> tags removed, both parts stripped.
        (f"<think>先思考</think>\n<answer>\n{launch}\n</answer>", "先思考", launch),
        (
            f"<think>{long_thinking}</think>\n<answer>\n{launch}\n</answer>",
            long_thinking,
            launch,
        ),
        # Rule 2: finish(message= marker.
        ('先总结一下\nfinish(message="任务完成")', "先总结一下", 'finish(message="任务完成")'),
        # Rule 3: do(action= marker, separated by a newline or a space.
        (
            '先分析页面元素\ndo(action="Tap", element=[120,240])',
            "先分析页面元素",
            'do(action="Tap", element=[120,240])',
        ),
        ('直接输出动作 do(action="Back")', "直接输出动作", 'do(action="Back")'),
        # Rule 4: no markers -> whole content is the action.
        ("这是一段没有动作标记的普通文本", "", "这是一段没有动作标记的普通文本"),
        # Think tags without answer tags fall through to rule 3, which strips
        # surrounding whitespace from the thinking but leaves the tags in it.
        (f"<think>先思考\n</think>\n{launch}", "<think>先思考\n</think>", launch),
    ]

    for content, expected_thinking, expected_action in cases:
        thinking, action = parse_response(content)
        assert thinking == expected_thinking, f"thinking mismatch for {content!r}"
        assert action == expected_action, f"action mismatch for {content!r}"