diff --git a/tests/benchmarks/appworld/mcp_server.py b/tests/benchmarks/appworld/mcp_server.py index bbb7eb9..f8a7c3f 100644 --- a/tests/benchmarks/appworld/mcp_server.py +++ b/tests/benchmarks/appworld/mcp_server.py @@ -230,7 +230,7 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> Any: # Save databases on task completion if api_name == "complete_task" or name == "supervisor__complete_task": Path(db_paths.output_db_path).mkdir(parents=True, exist_ok=True) - collections.model_collection.save(db_home_path=db_paths.output_db_path) + collections.model_collection.save(db_home_path=db_paths.output_db_path, save_model_hashes=True) return format_tool_response(response) except Exception as e: diff --git a/tests/benchmarks/appworld/prompts.py b/tests/benchmarks/appworld/prompts.py index c43d777..5b546d5 100644 --- a/tests/benchmarks/appworld/prompts.py +++ b/tests/benchmarks/appworld/prompts.py @@ -13,13 +13,12 @@ EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent -def load_system_instruction(task: Task, max_steps: int = 40) -> str: +def load_system_instruction(task: Task) -> str: """ Load and render system instruction from AppWorld's template with demo examples. Args: task: AppWorld Task object - max_steps: Maximum number of turns allowed Returns: Rendered system instruction with supervisor info, rules, and demos @@ -40,7 +39,6 @@ def load_system_instruction(task: Task, max_steps: int = 40) -> str: template_content, main_user=task.supervisor, app_descriptions=app_descriptions_yaml, - max_steps=max_steps, ) # Load demo messages and format them diff --git a/tests/benchmarks/appworld/system_instruction.txt b/tests/benchmarks/appworld/system_instruction.txt index e363781..9b44e1b 100644 --- a/tests/benchmarks/appworld/system_instruction.txt +++ b/tests/benchmarks/appworld/system_instruction.txt @@ -5,7 +5,7 @@ My name is: {{ main_user.first_name }} {{ main_user.last_name }}. My personal em You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has two parts, the app name and API name separated by "__", e.g., spotify__login is the login API for the Spotify app. -You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue either until you call `complete_task` API from the Supervisor app, or until a maximum of {max_steps} turns are reached. +You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call `complete_task` API from the Supervisor app. Here are brief app-wise descriptions. @@ -21,7 +21,7 @@ A. General instructions: - Never leave placeholders; don't output things like "your_username". Always fill in the real value by retrieving it via APIs (e.g., Supervisor app for credentials). - When I omit details, choose any valid value. For example, if I ask you to buy something but don't specify which payment card to use, you may pick any one of my available cards. - Avoid collateral damage. Only perform what I explicitly ask for. Example: if I ask you to buy something, do not delete emails, return the order, or perform unrelated account operations. -- You only have {max_steps} turns. Avoid unnecessary requests. You can batch unlimited function calls in a single turn - always group them to save steps. +- Avoid unnecessary requests. B. App-specific instructions: diff --git a/tests/benchmarks/appworld/test_appworld.py b/tests/benchmarks/appworld/test_appworld.py index c4ca3c3..4fd7772 100644 --- a/tests/benchmarks/appworld/test_appworld.py +++ b/tests/benchmarks/appworld/test_appworld.py @@ -50,7 +50,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @pytest.mark.asyncio -@pytest.mark.timeout(300) +@pytest.mark.timeout(900) async def test_appworld( task_id: str, model: str, diff --git a/uv.lock b/uv.lock index 5f4863f..d16fe1b 100644 --- a/uv.lock +++ b/uv.lock @@ -1090,7 +1090,7 @@ name = "grpcio" version = "1.76.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "python_full_version < '4'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b6/e0/318c1ce3ae5a17894d5791e87aea147587c9e702f24122cc7a5c8bbaeeb1/grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73", size = 12785182, upload-time = "2025-10-21T16:23:12.106Z" } wheels = [ @@ -2053,8 +2053,8 @@ name = "opentelemetry-exporter-otlp" version = "1.38.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "opentelemetry-exporter-otlp-proto-grpc" }, - { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-exporter-otlp-proto-grpc", marker = "python_full_version < '4'" }, + { name = "opentelemetry-exporter-otlp-proto-http", marker = "python_full_version < '4'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c2/2d/16e3487ddde2dee702bd746dd41950a8789b846d22a1c7e64824aac5ebea/opentelemetry_exporter_otlp-1.38.0.tar.gz", hash = "sha256:2f55acdd475e4136117eff20fbf1b9488b1b0b665ab64407516e1ac06f9c3f9d", size = 6147, upload-time = "2025-10-16T08:35:52.53Z" } wheels = [ @@ -2078,13 +2078,13 @@ name = "opentelemetry-exporter-otlp-proto-grpc" version = "1.38.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "googleapis-common-protos" }, - { name = "grpcio" }, - { name = "opentelemetry-api" }, - { name = "opentelemetry-exporter-otlp-proto-common" }, - { name = "opentelemetry-proto" }, - { name = "opentelemetry-sdk" }, - { name = "typing-extensions" }, + { name = "googleapis-common-protos", marker = "python_full_version < '4'" }, + { name = "grpcio", marker = "python_full_version < '4'" }, + { name = "opentelemetry-api", marker = "python_full_version < '4'" }, + { name = "opentelemetry-exporter-otlp-proto-common", marker = "python_full_version < '4'" }, + { name = "opentelemetry-proto", marker = "python_full_version < '4'" }, + { name = "opentelemetry-sdk", marker = "python_full_version < '4'" }, + { name = "typing-extensions", marker = "python_full_version < '4'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a2/c0/43222f5b97dc10812bc4f0abc5dc7cd0a2525a91b5151d26c9e2e958f52e/opentelemetry_exporter_otlp_proto_grpc-1.38.0.tar.gz", hash = "sha256:2473935e9eac71f401de6101d37d6f3f0f1831db92b953c7dcc912536158ebd6", size = 24676, upload-time = "2025-10-16T08:35:53.83Z" } wheels = [ @@ -2129,10 +2129,10 @@ name = "opentelemetry-instrumentation-anthropic" version = "0.47.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "opentelemetry-api" }, - { name = "opentelemetry-instrumentation" }, - { name = "opentelemetry-semantic-conventions" }, - { name = "opentelemetry-semantic-conventions-ai" }, + { name = "opentelemetry-api", marker = "python_full_version < '4'" }, + { name = "opentelemetry-instrumentation", marker = "python_full_version < '4'" }, + { name = "opentelemetry-semantic-conventions", marker = "python_full_version < '4'" }, + { name = "opentelemetry-semantic-conventions-ai", marker = "python_full_version < '4'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/87/ee/57e88683d32a49f1ff4ec2b1ed94359d7784135b63d8e0d6bc0b8cc4aa31/opentelemetry_instrumentation_anthropic-0.47.4.tar.gz", hash = "sha256:4ec2ec5f85a85e7fad492a3cea81e8e589fd87e2ecc6bb7dde6f008ca6b337f2", size = 14684, upload-time = "2025-10-22T17:38:06.171Z" } wheels = [ @@ -2159,11 +2159,11 @@ name = "opentelemetry-instrumentation-mcp" version = "0.47.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "opentelemetry-api" }, - { name = "opentelemetry-exporter-otlp" }, - { name = "opentelemetry-instrumentation" }, - { name = "opentelemetry-semantic-conventions" }, - { name = "opentelemetry-semantic-conventions-ai" }, + { name = "opentelemetry-api", marker = "python_full_version < '4'" }, + { name = "opentelemetry-exporter-otlp", marker = "python_full_version < '4'" }, + { name = "opentelemetry-instrumentation", marker = "python_full_version < '4'" }, + { name = "opentelemetry-semantic-conventions", marker = "python_full_version < '4'" }, + { name = "opentelemetry-semantic-conventions-ai", marker = "python_full_version < '4'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/67/ad/2894d1e405da43e34377a5025ecae4ad4ae1d0fbe0ad1a40f4fab8e2999a/opentelemetry_instrumentation_mcp-0.47.4.tar.gz", hash = "sha256:5aa03da02db479c4d846e52d240150aac9773e02a3cbaec07ba9a9bf3799cb58", size = 8832, upload-time = "2025-10-22T17:38:16.71Z" } wheels = [ @@ -2175,10 +2175,10 @@ name = "opentelemetry-instrumentation-openai" version = "0.47.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "opentelemetry-api" }, - { name = "opentelemetry-instrumentation" }, - { name = "opentelemetry-semantic-conventions" }, - { name = "opentelemetry-semantic-conventions-ai" }, + { name = "opentelemetry-api", marker = "python_full_version < '4'" }, + { name = "opentelemetry-instrumentation", marker = "python_full_version < '4'" }, + { name = "opentelemetry-semantic-conventions", marker = "python_full_version < '4'" }, + { name = "opentelemetry-semantic-conventions-ai", marker = "python_full_version < '4'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/98/46/eaf1a823c0f64cd737c9e112bf951e6f8b9bc2833d889b509543add8582a/opentelemetry_instrumentation_openai-0.47.4.tar.gz", hash = "sha256:3e70d9b44589886ee90a4bb4f4047f2a565207ba7164a09fc20425cdfb9f34ca", size = 25412, upload-time = "2025-10-22T17:38:20.376Z" } wheels = [ @@ -2850,6 +2850,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/93/2fa34714b7a4ae72f2f8dad66ba17dd9a2c793220719e736dda28b7aec27/pytest_asyncio-1.2.0-py3-none-any.whl", hash = "sha256:8e17ae5e46d8e7efe51ab6494dd2010f4ca8dae51652aa3c8d55acf50bfb2e99", size = 15095, upload-time = "2025-09-12T07:33:52.639Z" }, ] +[[package]] +name = "pytest-timeout" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, +] + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -3708,6 +3720,8 @@ dev = [ { name = "mypy" }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "pytest-timeout" }, + { name = "pytest-xdist" }, { name = "ruff" }, ] evals = [ @@ -3729,6 +3743,8 @@ requires-dist = [ { name = "mypy", marker = "extra == 'dev'" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.21" }, + { name = "pytest-timeout", marker = "extra == 'dev'", specifier = ">=2.0" }, + { name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.0" }, { name = "rich", specifier = ">=13.0.0" }, { name = "ruff", marker = "extra == 'dev'" }, ]