diff --git a/tests/benchmarks/appworld/mcp_server.py b/tests/benchmarks/appworld/mcp_server.py index f8a7c3f..aed4068 100644 --- a/tests/benchmarks/appworld/mcp_server.py +++ b/tests/benchmarks/appworld/mcp_server.py @@ -251,6 +251,11 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> Any: ), ) + # Clean up database connections before exiting + # This ensures SQLite connections are closed and the process can exit cleanly + collections.model_collection.close() + collections.apis.close() + async def main() -> None: parser = argparse.ArgumentParser(description="AppWorld MCP Server with task-specific state") diff --git a/tests/benchmarks/appworld/prompts.py b/tests/benchmarks/appworld/prompts.py index 5b546d5..aee66f1 100644 --- a/tests/benchmarks/appworld/prompts.py +++ b/tests/benchmarks/appworld/prompts.py @@ -8,6 +8,7 @@ from appworld.common.io import dump_yaml, read_file, read_json from appworld.common.text import render_template from appworld.task import Task +from fast_agent.mcp.common import create_namespaced_name # Path to installed appworld_experiments package EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent @@ -54,8 +55,14 @@ def load_system_instruction(task: Task) -> str: return base_instruction + demo_text -def _format_demo_messages(demo_messages: list[dict[str, Any]]) -> str: - """Format demo messages as readable conversation.""" +def _format_demo_messages(demo_messages: list[dict[str, Any]], server_name: str = "appworld") -> str: + """ + Format demo messages as readable conversation. 
+ + Args: + demo_messages: List of demo message dictionaries + server_name: MCP server name (default: "appworld") + """ demo_text_parts = ["\n"] for msg in demo_messages: @@ -72,13 +79,14 @@ def _format_demo_messages(demo_messages: list[dict[str, Any]]) -> str: calls = [] for tc in tool_calls: func_name = tc["function"]["name"] + prefixed_name = create_namespaced_name(server_name, func_name) func_args = tc["function"]["arguments"] args_dict = json.loads(func_args) if isinstance(func_args, str) else func_args if args_dict: args_str = ", ".join(f"{k}={repr(v)}" for k, v in args_dict.items()) - calls.append(f"{func_name}({args_str})") + calls.append(f"{prefixed_name}({args_str})") else: - calls.append(f"{func_name}()") + calls.append(f"{prefixed_name}()") demo_text_parts.append("\n" + "\n".join(calls)) elif content: demo_text_parts.append(f"\n{content}") diff --git a/tests/benchmarks/appworld/system_instruction.txt b/tests/benchmarks/appworld/system_instruction.txt index 9b44e1b..f777a4f 100644 --- a/tests/benchmarks/appworld/system_instruction.txt +++ b/tests/benchmarks/appworld/system_instruction.txt @@ -3,9 +3,9 @@ I am your supervisor, and you are an AI Assistant whose job is to complete my da My name is: {{ main_user.first_name }} {{ main_user.last_name }}. My personal email is {{ main_user.email }} and phone number is {{ main_user.phone_number }}. -You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has two parts, the app name and API name separated by "__", e.g., spotify__login is the login API for the Spotify app. +You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has three parts: the server name "appworld", the app name, and the API name, all separated by "__" (double underscore). 
For example, appworld__spotify__login is the login API for the Spotify app. -You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call `complete_task` API from the Supervisor app. +You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call the `appworld__supervisor__complete_task` API. Here are brief app-wise descriptions. @@ -35,7 +35,7 @@ B. App-specific instructions: C. Task-completion instructions: -You must call the `supervisor__complete_task` API after completing the task. +You must call the `appworld__supervisor__complete_task` API after completing the task. - If an answer is needed, e.g., for "How many songs are in the Spotify queue?", call it with the appropriate answer argument value. - If no answer is required, e.g., for "Start my Spotify music player.", omit the answer argument (or set it to None/null). - The task is doable, but if you cannot find a way, you can call it with status="fail" to exit with failure.