Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions tests/benchmarks/appworld/mcp_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,11 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> Any:
),
)

# Clean up database connections before exiting
# This ensures SQLite connections are closed and the process can exit cleanly
collections.model_collection.close()
collections.apis.close()


async def main() -> None:
parser = argparse.ArgumentParser(description="AppWorld MCP Server with task-specific state")
Expand Down
16 changes: 12 additions & 4 deletions tests/benchmarks/appworld/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from appworld.common.io import dump_yaml, read_file, read_json
from appworld.common.text import render_template
from appworld.task import Task
from fast_agent.mcp.common import create_namespaced_name

# Path to installed appworld_experiments package
EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent
Expand Down Expand Up @@ -54,8 +55,14 @@ def load_system_instruction(task: Task) -> str:
return base_instruction + demo_text


def _format_demo_messages(demo_messages: list[dict[str, Any]]) -> str:
"""Format demo messages as readable conversation."""
def _format_demo_messages(demo_messages: list[dict[str, Any]], server_name: str = "appworld") -> str:
"""
Format demo messages as readable conversation.

Args:
demo_messages: List of demo message dictionaries
server_name: MCP server name (default: "appworld")
"""
demo_text_parts = ["\n"]

for msg in demo_messages:
Expand All @@ -72,13 +79,14 @@ def _format_demo_messages(demo_messages: list[dict[str, Any]]) -> str:
calls = []
for tc in tool_calls:
func_name = tc["function"]["name"]
prefixed_name = create_namespaced_name(server_name, func_name)
func_args = tc["function"]["arguments"]
args_dict = json.loads(func_args) if isinstance(func_args, str) else func_args
if args_dict:
args_str = ", ".join(f"{k}={repr(v)}" for k, v in args_dict.items())
calls.append(f"{func_name}({args_str})")
calls.append(f"{prefixed_name}({args_str})")
else:
calls.append(f"{func_name}()")
calls.append(f"{prefixed_name}()")
demo_text_parts.append("\n" + "\n".join(calls))
elif content:
demo_text_parts.append(f"\n{content}")
Expand Down
6 changes: 3 additions & 3 deletions tests/benchmarks/appworld/system_instruction.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ I am your supervisor, and you are an AI Assistant whose job is to complete my da

My name is: {{ main_user.first_name }} {{ main_user.last_name }}. My personal email is {{ main_user.email }} and phone number is {{ main_user.phone_number }}.

You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has two parts, the app name and API name separated by "__", e.g., spotify__login is the login API for the Spotify app.
You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has three parts: the server name "appworld", the app name, and the API name, all separated by "__" (double underscore). For example, `appworld__spotify__login` is the login API for the Spotify app.

You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call `complete_task` API from the Supervisor app.
You will complete the task fully autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call the `appworld__supervisor__complete_task` API.

Here are brief app-wise descriptions.

Expand Down Expand Up @@ -35,7 +35,7 @@ B. App-specific instructions:

C. Task-completion instructions:

You must call the `supervisor__complete_task` API after completing the task.
You must call the `appworld__supervisor__complete_task` API after completing the task.
- If an answer is needed, e.g., for "How many songs are in the Spotify queue?", call it with the appropriate answer argument value.
- If no answer is required, e.g., for "Start my Spotify music player.", omit the answer argument (or set it to None/null).
- The task is doable, but if you cannot find a way, you can call it with status="fail" to exit with failure.
Expand Down