chughtapan · chughtapan · Nov 7, 2025 · Nov 7, 2025
diff --git a/tests/benchmarks/appworld/mcp_server.py b/tests/benchmarks/appworld/mcp_server.py
@@ -251,6 +251,11 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> Any:
             ),
         )
 
+    # Clean up database connections before exiting
+    # This ensures SQLite connections are closed and the process can exit cleanly
+    collections.model_collection.close()
+    collections.apis.close()
+
 
 async def main() -> None:
     parser = argparse.ArgumentParser(description="AppWorld MCP Server with task-specific state")

diff --git a/tests/benchmarks/appworld/prompts.py b/tests/benchmarks/appworld/prompts.py
@@ -8,6 +8,7 @@
 from appworld.common.io import dump_yaml, read_file, read_json
 from appworld.common.text import render_template
 from appworld.task import Task
+from fast_agent.mcp.common import create_namespaced_name
 
 # Path to installed appworld_experiments package
 EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent
@@ -54,8 +55,14 @@ def load_system_instruction(task: Task) -> str:
     return base_instruction + demo_text
 
 
-def _format_demo_messages(demo_messages: list[dict[str, Any]]) -> str:
-    """Format demo messages as readable conversation."""
+def _format_demo_messages(demo_messages: list[dict[str, Any]], server_name: str = "appworld") -> str:
+    """
+    Format demo messages as readable conversation.
+
+    Args:
+        demo_messages: List of demo message dictionaries
+        server_name: MCP server name used to namespace each tool-call name (default: "appworld")
+    """
     demo_text_parts = ["\n"]
 
     for msg in demo_messages:
@@ -72,13 +79,14 @@ def _format_demo_messages(demo_messages: list[dict[str, Any]]) -> str:
                 calls = []
                 for tc in tool_calls:
                     func_name = tc["function"]["name"]
+                    prefixed_name = create_namespaced_name(server_name, func_name)
                     func_args = tc["function"]["arguments"]
                     args_dict = json.loads(func_args) if isinstance(func_args, str) else func_args
                     if args_dict:
                         args_str = ", ".join(f"{k}={repr(v)}" for k, v in args_dict.items())
-                        calls.append(f"{func_name}({args_str})")
+                        calls.append(f"{prefixed_name}({args_str})")
                     else:
-                        calls.append(f"{func_name}()")
+                        calls.append(f"{prefixed_name}()")
                 demo_text_parts.append("\n" + "\n".join(calls))
             elif content:
                 demo_text_parts.append(f"\n{content}")

diff --git a/tests/benchmarks/appworld/system_instruction.txt b/tests/benchmarks/appworld/system_instruction.txt
@@ -3,9 +3,9 @@ I am your supervisor, and you are an AI Assistant whose job is to complete my da
 
 My name is: {{ main_user.first_name }} {{ main_user.last_name }}. My personal email is {{ main_user.email }} and phone number is {{ main_user.phone_number }}.
 
-You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has two parts, the app name and API name separated by "__", e.g., spotify__login is the login API for the Spotify app.
+You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has three parts: the server name "appworld", the app name, and the API name, all separated by "__" (double underscore). For example, appworld__spotify__login is the login API for the Spotify app.
 
-You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call `complete_task` API from the Supervisor app.
+You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call the `appworld__supervisor__complete_task` API.
 
 Here are brief app-wise descriptions.
 
@@ -35,7 +35,7 @@ B. App-specific instructions:
 
 C. Task-completion instructions:
 
-You must call the `supervisor__complete_task` API after completing the task.
+You must call the `appworld__supervisor__complete_task` API after completing the task.
 - If an answer is needed, e.g., for "How many songs are in the Spotify queue?", call it with the appropriate answer argument value.
 - If no answer is required, e.g., for "Start my Spotify music player.", omit the answer argument (or set it to None/null).
 - The task is doable, but if you cannot find a way, you can call it with status="fail" to exit with failure.