From 8758142b670593cbca89158a423752067525501b Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Tue, 18 Nov 2025 13:42:12 -0800 Subject: [PATCH 1/7] ruff Signed-off-by: Khushi Bhardwaj --- .../terminus_format_server/README.md | 11 +++++ .../terminus_format_server/app.py | 46 +++++++++++++++++++ .../configs/terminus_format_server.yaml | 38 +++++++++++++++ .../terminus_format_server/data/.gitignore | 5 ++ .../terminus_format_server/requirements.txt | 2 + .../terminus_format_server/tests/test_app.py | 32 +++++++++++++ 6 files changed, 134 insertions(+) create mode 100644 resources_servers/terminus_format_server/README.md create mode 100644 resources_servers/terminus_format_server/app.py create mode 100644 resources_servers/terminus_format_server/configs/terminus_format_server.yaml create mode 100644 resources_servers/terminus_format_server/data/.gitignore create mode 100644 resources_servers/terminus_format_server/requirements.txt create mode 100644 resources_servers/terminus_format_server/tests/test_app.py diff --git a/resources_servers/terminus_format_server/README.md b/resources_servers/terminus_format_server/README.md new file mode 100644 index 000000000..b56a438df --- /dev/null +++ b/resources_servers/terminus_format_server/README.md @@ -0,0 +1,11 @@ +# Description + +Data links: ? + +# Licensing information +Code: ? +Data: ? + +Dependencies +- nemo_gym: Apache 2.0 +? diff --git a/resources_servers/terminus_format_server/app.py b/resources_servers/terminus_format_server/app.py new file mode 100644 index 000000000..754d303b6 --- /dev/null +++ b/resources_servers/terminus_format_server/app.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from fastapi import FastAPI + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) + + +class TerminusFormatServerResourcesServerConfig(BaseResourcesServerConfig): + pass + + +class TerminusFormatServerResourcesServer(SimpleResourcesServer): + config: TerminusFormatServerResourcesServerConfig + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + + # Additional server routes go here! e.g.: + # app.post("/get_weather")(self.get_weather) + + return app + + async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: + return BaseVerifyResponse(**body.model_dump(), reward=1.0) + + +if __name__ == "__main__": + TerminusFormatServerResourcesServer.run_webserver() diff --git a/resources_servers/terminus_format_server/configs/terminus_format_server.yaml b/resources_servers/terminus_format_server/configs/terminus_format_server.yaml new file mode 100644 index 000000000..23e2ce9fa --- /dev/null +++ b/resources_servers/terminus_format_server/configs/terminus_format_server.yaml @@ -0,0 +1,38 @@ +terminus_format_server_resources_server: + resources_servers: + terminus_format_server: + entrypoint: app.py + verified: false +terminus_format_server_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: terminus_format_server_resources_server + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: 
resources_servers/terminus_format_server/data/train.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: terminus_format_server + version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: validation + type: validation + jsonl_fpath: resources_servers/terminus_format_server/data/validation.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: terminus_format_server + version: 0.0.1 + artifact_fpath: validation.jsonl + license: Apache 2.0 + - name: example + type: example + jsonl_fpath: resources_servers/terminus_format_server/data/example.jsonl + num_repeats: 1 diff --git a/resources_servers/terminus_format_server/data/.gitignore b/resources_servers/terminus_format_server/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/resources_servers/terminus_format_server/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/resources_servers/terminus_format_server/requirements.txt b/resources_servers/terminus_format_server/requirements.txt new file mode 100644 index 000000000..3f80e8452 --- /dev/null +++ b/resources_servers/terminus_format_server/requirements.txt @@ -0,0 +1,2 @@ +-e nemo-gym[dev] @ ../../ +openapi-schema-validator==0.6.3 \ No newline at end of file diff --git a/resources_servers/terminus_format_server/tests/test_app.py b/resources_servers/terminus_format_server/tests/test_app.py new file mode 100644 index 000000000..047b8dc6e --- /dev/null +++ b/resources_servers/terminus_format_server/tests/test_app.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest.mock import MagicMock + +from nemo_gym.server_utils import ServerClient +from resources_servers.terminus_format_server.app import ( + TerminusFormatServerResourcesServer, + TerminusFormatServerResourcesServerConfig, +) + + +class TestApp: + def test_sanity(self) -> None: + config = TerminusFormatServerResourcesServerConfig( + host="0.0.0.0", + port=8080, + entrypoint="", + name="", + ) + TerminusFormatServerResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) From 5eb98179b4217cf44a3a9e8d54aa21841e8aaa0e Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Tue, 18 Nov 2025 18:56:09 -0800 Subject: [PATCH 2/7] update app.py Signed-off-by: Khushi Bhardwaj --- .../terminus_format_server/app.py | 92 ++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/resources_servers/terminus_format_server/app.py b/resources_servers/terminus_format_server/app.py index 754d303b6..88723bd95 100644 --- a/resources_servers/terminus_format_server/app.py +++ b/resources_servers/terminus_format_server/app.py @@ -13,7 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +from typing import Any, Dict + from fastapi import FastAPI +from openapi_schema_validator import validate as validate_against_schema_openapi from nemo_gym.base_resources_server import ( BaseResourcesServerConfig, @@ -27,6 +31,60 @@ class TerminusFormatServerResourcesServerConfig(BaseResourcesServerConfig): pass +# Fixed JSON schema for the terminal agent response. 
+TERMINUS_FORMAT_SCHEMA: Dict[str, Any] = { + "type": "object", + "title": "terminal_agent_response", + "properties": { + "analysis": {"type": "string"}, + "plan": {"type": "string"}, + "commands": { + "type": "array", + "items": { + "type": "object", + "properties": { + "keystrokes": {"type": "string"}, + "duration": { + "type": "number", + "default": 1.0, + "minimum": 0, + }, + }, + "required": ["keystrokes"], + "additionalProperties": False, + }, + }, + "task_complete": { + "type": "boolean", + "default": False, + }, + }, + "required": ["analysis", "plan", "commands"], + "additionalProperties": False, + # commands must be EITHER: + # - empty array: [] + # - OR array with ≥1 item (and keystrokes required per item) + "anyOf": [ + { + "properties": { + "commands": { + "type": "array", + "maxItems": 0, + } + } + }, + { + "properties": { + "commands": { + "type": "array", + "minItems": 1, + } + } + }, + ], +} + + class TerminusFormatServerResourcesServer(SimpleResourcesServer): config: TerminusFormatServerResourcesServerConfig @@ -39,7 +97,39 @@ def setup_webserver(self) -> FastAPI: return app async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: - return BaseVerifyResponse(**body.model_dump(), reward=1.0) + assistant_responses = [] + for output_item in body.response.output: + if output_item.type != "message": + continue + + for content_item in output_item.content: + if content_item.type != "output_text": + continue + + assistant_responses.append(content_item.text) + + response_text = "".join(assistant_responses) + + reward = self.evaluate_terminus_format_response_json(response_text) + return BaseVerifyResponse(**body.model_dump(), reward=reward) + + # ----- JSON Helpers ----- # + def evaluate_terminus_format_response_json(self, response_text: str) -> float: + """Validate the model response against the fixed terminus format schema.""" + try: + response_obj = json.loads(response_text) + except Exception: + # Not valid JSON + return 0.0 + + try: + 
validate_against_schema_openapi(response_obj, TERMINUS_FORMAT_SCHEMA) + except Exception: + # JSON but does not match schema + return 0.0 + + # Valid JSON and matches schema + return 1.0 if __name__ == "__main__": From 9971a03d2918804f51b94cba2dac04347045d1ee Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Wed, 19 Nov 2025 15:50:54 -0800 Subject: [PATCH 3/7] change name Signed-off-by: Khushi Bhardwaj --- resources_servers/terminus_format/README.md | 41 +++++++++++++++++++ .../app.py | 9 ++-- .../configs/terminus_format.yaml} | 19 +++++---- .../data/.gitignore | 0 .../terminus_format/data/example.jsonl | 5 +++ .../requirements.txt | 0 .../tests/test_app.py | 0 .../terminus_format_server/README.md | 11 ----- 8 files changed, 61 insertions(+), 24 deletions(-) create mode 100644 resources_servers/terminus_format/README.md rename resources_servers/{terminus_format_server => terminus_format}/app.py (93%) rename resources_servers/{terminus_format_server/configs/terminus_format_server.yaml => terminus_format/configs/terminus_format.yaml} (59%) rename resources_servers/{terminus_format_server => terminus_format}/data/.gitignore (100%) create mode 100644 resources_servers/terminus_format/data/example.jsonl rename resources_servers/{terminus_format_server => terminus_format}/requirements.txt (100%) rename resources_servers/{terminus_format_server => terminus_format}/tests/test_app.py (100%) delete mode 100644 resources_servers/terminus_format_server/README.md diff --git a/resources_servers/terminus_format/README.md b/resources_servers/terminus_format/README.md new file mode 100644 index 000000000..7227fe2e7 --- /dev/null +++ b/resources_servers/terminus_format/README.md @@ -0,0 +1,41 @@ +# Description + +Data links: ? + +# Licensing information +Code: ? +Data: ? 
+ +Download this artifact: +```bash +ng_download_dataset_from_gitlab \ + +dataset_name=terminus_format_dataset \ + +version=0.0.1 \ + +artifact_fpath=example.jsonl \ + +output_fpath=resources_servers/terminus_format/data/example.jsonl +``` + +Example data: +`resources_servers/terminus_format/data/example.jsonl` + +# Example usage + +The following are example commands for running this resource server, along with the simple agent and an OpenAI model: +```bash +config_paths="responses_api_models/openai_model/configs/openai_model.yaml, \ +resources_servers/terminus_format/configs/terminus_format.yaml" +ng_run "+config_paths=[$config_paths]" +``` + +Then, rollouts can be collected using a command such as the following: +```bash +ng_collect_rollouts \ + +agent_name=terminus_format_simple_agent \ + +input_jsonl_fpath=resources_servers/terminus_format/data/example.jsonl \ + +output_jsonl_fpath=results/example_terminus_format_json.jsonl \ + +limit=1 +``` + +Dependencies +- nemo_gym: Apache 2.0 +- openapi-schema-validator: [BSD-3-Clause license](https://github.com/python-openapi/openapi-schema-validator/blob/master/LICENSE) diff --git a/resources_servers/terminus_format_server/app.py b/resources_servers/terminus_format/app.py similarity index 93% rename from resources_servers/terminus_format_server/app.py rename to resources_servers/terminus_format/app.py index 88723bd95..6be0edef1 100644 --- a/resources_servers/terminus_format_server/app.py +++ b/resources_servers/terminus_format/app.py @@ -27,7 +27,7 @@ ) -class TerminusFormatServerResourcesServerConfig(BaseResourcesServerConfig): +class TerminusFormatResourcesServerConfig(BaseResourcesServerConfig): pass @@ -85,8 +85,8 @@ class TerminusFormatServerResourcesServerConfig(BaseResourcesServerConfig): } -class TerminusFormatServerResourcesServer(SimpleResourcesServer): - config: TerminusFormatServerResourcesServerConfig +class TerminusFormatResourcesServer(SimpleResourcesServer): + config: TerminusFormatResourcesServerConfig 
def setup_webserver(self) -> FastAPI: app = super().setup_webserver() @@ -109,6 +109,7 @@ async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: assistant_responses.append(content_item.text) response_text = "".join(assistant_responses) + print(response_text) reward = self.evaluate_terminus_format_response_json(response_text) return BaseVerifyResponse(**body.model_dump(), reward=reward) @@ -133,4 +134,4 @@ def evaluate_terminus_format_response_json(self, response_text: str) -> float: if __name__ == "__main__": - TerminusFormatServerResourcesServer.run_webserver() + TerminusFormatResourcesServer.run_webserver() diff --git a/resources_servers/terminus_format_server/configs/terminus_format_server.yaml b/resources_servers/terminus_format/configs/terminus_format.yaml similarity index 59% rename from resources_servers/terminus_format_server/configs/terminus_format_server.yaml rename to resources_servers/terminus_format/configs/terminus_format.yaml index 23e2ce9fa..5c77cbcb9 100644 --- a/resources_servers/terminus_format_server/configs/terminus_format_server.yaml +++ b/resources_servers/terminus_format/configs/terminus_format.yaml @@ -1,38 +1,39 @@ -terminus_format_server_resources_server: +terminus_format: resources_servers: - terminus_format_server: + terminus_format: entrypoint: app.py + domain: instruction_following verified: false -terminus_format_server_simple_agent: +terminus_format_simple_agent: responses_api_agents: simple_agent: entrypoint: app.py resources_server: type: resources_servers - name: terminus_format_server_resources_server + name: terminus_format model_server: type: responses_api_models name: policy_model datasets: - name: train type: train - jsonl_fpath: resources_servers/terminus_format_server/data/train.jsonl + jsonl_fpath: resources_servers/terminus_format/data/train.jsonl num_repeats: 1 gitlab_identifier: - dataset_name: terminus_format_server + dataset_name: terminus_format version: 0.0.1 artifact_fpath: train.jsonl license: 
Apache 2.0 - name: validation type: validation - jsonl_fpath: resources_servers/terminus_format_server/data/validation.jsonl + jsonl_fpath: resources_servers/terminus_format/data/validation.jsonl num_repeats: 1 gitlab_identifier: - dataset_name: terminus_format_server + dataset_name: terminus_format version: 0.0.1 artifact_fpath: validation.jsonl license: Apache 2.0 - name: example type: example - jsonl_fpath: resources_servers/terminus_format_server/data/example.jsonl + jsonl_fpath: resources_servers/terminus_format/data/example.jsonl num_repeats: 1 diff --git a/resources_servers/terminus_format_server/data/.gitignore b/resources_servers/terminus_format/data/.gitignore similarity index 100% rename from resources_servers/terminus_format_server/data/.gitignore rename to resources_servers/terminus_format/data/.gitignore diff --git a/resources_servers/terminus_format/data/example.jsonl b/resources_servers/terminus_format/data/example.jsonl new file mode 100644 index 000000000..dfbff59da --- /dev/null +++ b/resources_servers/terminus_format/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. 
Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. 
This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. 
When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. 
If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 0, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. 
The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). 
Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 1, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. 
The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). 
Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've confirmed that Python 3.10.16 is installed, which matches the environment described in the task. Now I need to check if typeguard is installed and install it if needed.\\\",\\n \\\"explanation\\\": \\\"I'll check if typeguard is installed by trying to import it in Python. 
If it's not installed, I'll install it using pip.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python -c \\\\\\\"import typeguard; print('typeguard is installed')\\\\\\\"\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 2, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. 
The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). 
Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've confirmed that Python 3.10.16 is installed, which matches the environment described in the task. Now I need to check if typeguard is installed and install it if needed.\\\",\\n \\\"explanation\\\": \\\"I'll check if typeguard is installed by trying to import it in Python. 
If it's not installed, I'll install it using pip.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python -c \\\\\\\"import typeguard; print('typeguard is installed')\\\\\\\"\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"Typeguard is already installed in the environment. Now I need to create a simple Python module to test with and then create a script that reproduces the issue described in the task.\\\",\\n \\\"explanation\\\": \\\"I'll create a simple Python module called 'my_module.py' with some basic code, then create a test script that enables debug_instrumentation and tries to import the module.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"echo 'def hello():\\\\n return \\\\\\\"Hello, World!\\\\\\\"' > my_module.py\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed# echo 'def hello():\\n> return \\\"Hello, World!\\\"' > my_module.py; tmux wait -S done\\n(testbed) 
root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 3, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. 
Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. 
This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've confirmed that Python 3.10.16 is installed, which matches the environment described in the task. Now I need to check if typeguard is installed and install it if needed.\\\",\\n \\\"explanation\\\": \\\"I'll check if typeguard is installed by trying to import it in Python. 
If it's not installed, I'll install it using pip.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python -c \\\\\\\"import typeguard; print('typeguard is installed')\\\\\\\"\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"Typeguard is already installed in the environment. Now I need to create a simple Python module to test with and then create a script that reproduces the issue described in the task.\\\",\\n \\\"explanation\\\": \\\"I'll create a simple Python module called 'my_module.py' with some basic code, then create a test script that enables debug_instrumentation and tries to import the module.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"echo 'def hello():\\\\n return \\\\\\\"Hello, World!\\\\\\\"' > my_module.py\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed# echo 'def hello():\\n> return \\\"Hello, World!\\\"' > my_module.py; tmux wait -S done\\n(testbed) 
root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've created a simple Python module called 'my_module.py'. Now I need to create a test script that reproduces the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll create a test script that enables debug_instrumentation, installs the import hook, and then tries to import the module we just created.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"cat > test_import_hook.py << 'EOF'\\\\nimport typeguard\\\\nfrom typeguard import config\\\\nimport sys\\\\n\\\\n# Enable debug instrumentation\\\\nconfig.debug_instrumentation = True\\\\n\\\\n# Install the import hook\\\\ntypeguard.install_import_hook()\\\\n\\\\n# Try to import a module\\\\nimport my_module # This will fail\\\\n\\\\nprint(\\\\\\\"Import successful!\\\\\\\")\\\\nprint(my_module.hello())\\\\nEOF\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed# echo 'def hello():\\n> return \\\"Hello, World!\\\"' > my_module.py; tmux wait -S done\\n(testbed) root@fde35d7355a5:/testbed# cat > test_import_hook.py << 'EOF'\\n> import typeguard\\n> from typeguard import config\\n> import sys\\n>\\n> # Enable debug instrumentation\\n> config.debug_instrumentation = True\\n>\\n> # Install the import hook\\n> typeguard.install_import_hook()\\n>\\n> # Try to import a module\\n> import my_module # This will fail\\n>\\n> print(\\\"Import successful!\\\")\\n> 
print(my_module.hello())\\n> EOF\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 4, "total_steps": 32}]}} diff --git a/resources_servers/terminus_format_server/requirements.txt b/resources_servers/terminus_format/requirements.txt similarity index 100% rename from resources_servers/terminus_format_server/requirements.txt rename to resources_servers/terminus_format/requirements.txt diff --git a/resources_servers/terminus_format_server/tests/test_app.py b/resources_servers/terminus_format/tests/test_app.py similarity index 100% rename from resources_servers/terminus_format_server/tests/test_app.py rename to resources_servers/terminus_format/tests/test_app.py diff --git a/resources_servers/terminus_format_server/README.md b/resources_servers/terminus_format_server/README.md deleted file mode 100644 index b56a438df..000000000 --- a/resources_servers/terminus_format_server/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Description - -Data links: ? - -# Licensing information -Code: ? -Data: ? - -Dependencies -- nemo_gym: Apache 2.0 -? 
From e23d73fef7c23b8d4dbe01c2151df7e37d6c0690 Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Wed, 19 Nov 2025 18:52:52 -0800 Subject: [PATCH 4/7] debugging Signed-off-by: Khushi Bhardwaj --- resources_servers/terminus_format/app.py | 158 ++++++++++++++++++++++- 1 file changed, 151 insertions(+), 7 deletions(-) diff --git a/resources_servers/terminus_format/app.py b/resources_servers/terminus_format/app.py index 6be0edef1..9dd25c69e 100644 --- a/resources_servers/terminus_format/app.py +++ b/resources_servers/terminus_format/app.py @@ -85,6 +85,81 @@ class TerminusFormatResourcesServerConfig(BaseResourcesServerConfig): } +COMMAND_BATCH_RESPONSE_SCHEMA = { + "title": "CommandBatchResponse", + "type": "object", + "additionalProperties": False, + "definitions": { + "Command": { + "title": "Command", + "type": "object", + "additionalProperties": False, + "properties": { + "keystrokes": { + "title": "Keystrokes", + "description": ( + "Keystrokes to execute in the terminal. Use tmux-style escape " + "sequences for modifier keys (e.g. C-c for ctrl-c). Modifier keys " + "must be sent as their own commands otherwise the characters will " + "be interpreted literally." + ), + "type": "string", + }, + "is_blocking": { + "title": "Is Blocking", + "description": ( + "Whether to wait for and return the terminal output after executing " + "these keystrokes. This will append '; tmux wait -S done' to your " + "command. DO NOT block on modifier keys or inside interactive " + "programs (e.g. vim or less). Only block when the command is " + "executed in the command line, is not interactive, and you expect " + "the output to be returned with no intervention. When in doubt, " + "wait instead of blocking." 
+ ), + "type": "boolean", + }, + "timeout_sec": { + "title": "Timeout Sec", + "description": "The number of expected seconds to wait for the command to complete.", + "type": "number", + }, + }, + "required": ["keystrokes", "is_blocking", "timeout_sec"], + } + }, + "properties": { + "state_analysis": { + "title": "State Analysis", + "description": "Description of the current state of the terminal", + "type": "string", + }, + "explanation": { + "title": "Explanation", + "description": "Brief explanation of what these commands will do", + "type": "string", + }, + "commands": { + "title": "Commands", + "description": "List of shell interactions to execute in the Docker container", + "type": "array", + "items": { + "$ref": "#/definitions/Command", + }, + }, + "is_task_complete": { + "title": "Is Task Complete", + "description": ( + "Whether the task is complete following the execution of these commands. " + "Make sure to check that the command you last executed worked before " + "saying you're done." 
+ ), + "type": "boolean", + }, + }, + "required": ["state_analysis", "explanation", "commands", "is_task_complete"], +} + + class TerminusFormatResourcesServer(SimpleResourcesServer): config: TerminusFormatResourcesServerConfig @@ -96,7 +171,52 @@ def setup_webserver(self) -> FastAPI: return app + # async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: + # assistant_responses = [] + # for output_item in body.response.output: + # if output_item.type != "message": + # continue + + # for content_item in output_item.content: + # if content_item.type != "output_text": + # continue + + # assistant_responses.append(content_item.text) + + # response_text = "".join(assistant_responses) + # print(response_text) + + # reward = self.evaluate_terminus_format_response_json(response_text) + # return BaseVerifyResponse(**body.model_dump(), reward=reward) + + # ----- JSON Helpers ----- # + # def evaluate_terminus_format_response_json(self, response_text: str) -> float: + # """Validate the model response against the fixed terminus format schema.""" + # try: + # response_obj = json.loads(response_text) + # except Exception: + # # Not valid JSON + # return 0.0 + + # try: + # validate_against_schema_openapi(response_obj, COMMAND_BATCH_RESPONSE_SCHEMA) + # except Exception: + # # JSON but does not match schema + # return 0.0 + + # # Valid JSON and matches schema + # return 1.0 + async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: + log_file = "validation_errors.txt" + + # Log that verify was called + with open(log_file, "a") as f: + f.write(f"\n{'=' * 80}\n") + f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") + f.write("🔍 Verify method called\n") + f.write(f"Body: {body.model_dump()}\n") + assistant_responses = [] for output_item in body.response.output: if output_item.type != "message": @@ -109,27 +229,51 @@ async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: assistant_responses.append(content_item.text) 
response_text = "".join(assistant_responses) + + # Log what we extracted + with open(log_file, "a") as f: + f.write(f"Extracted response text length: {len(response_text)}\n") + f.write(f"Response text preview: {response_text[:200]}\n") + f.write(f"{'=' * 80}\n\n") + print(response_text) reward = self.evaluate_terminus_format_response_json(response_text) return BaseVerifyResponse(**body.model_dump(), reward=reward) - # ----- JSON Helpers ----- # def evaluate_terminus_format_response_json(self, response_text: str) -> float: """Validate the model response against the fixed terminus format schema.""" + log_file = "validation_errors.txt" + try: response_obj = json.loads(response_text) - except Exception: - # Not valid JSON + except Exception as e: + with open(log_file, "a") as f: + f.write(f"\n{'=' * 80}\n") + f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") + f.write(f"❌ JSON parsing failed: {e}\n") + f.write(f"Response text: {response_text}\n") + f.write(f"{'=' * 80}\n\n") return 0.0 try: - validate_against_schema_openapi(response_obj, TERMINUS_FORMAT_SCHEMA) - except Exception: - # JSON but does not match schema + validate_against_schema_openapi(response_obj, COMMAND_BATCH_RESPONSE_SCHEMA) + except Exception as e: + with open(log_file, "a") as f: + f.write(f"\n{'=' * 80}\n") + f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") + f.write(f"❌ Schema validation failed: {e}\n") + f.write(f"Response object: {json.dumps(response_obj, indent=2)}\n") + f.write(f"{'=' * 80}\n\n") return 0.0 - # Valid JSON and matches schema + # with open(log_file, "a") as f: + # f.write(f"\n{'='*80}\n") + # f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") + # f.write(f"✅ Validation passed!\n") + # f.write(f"Response object: {json.dumps(response_obj, indent=2)}\n") + # f.write(f"{'='*80}\n\n") + return 1.0 From fde1dc2894eabfc3b2a70029f0ad962cd41b6da5 Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Thu, 20 Nov 2025 15:22:34 -0800 Subject: 
[PATCH 5/7] cleanup app, readme Signed-off-by: Khushi Bhardwaj --- resources_servers/terminus_format/README.md | 20 ++- resources_servers/terminus_format/app.py | 137 +------------------- 2 files changed, 22 insertions(+), 135 deletions(-) diff --git a/resources_servers/terminus_format/README.md b/resources_servers/terminus_format/README.md index 7227fe2e7..6e1f0cdf3 100644 --- a/resources_servers/terminus_format/README.md +++ b/resources_servers/terminus_format/README.md @@ -1,10 +1,12 @@ # Description -Data links: ? +This is a simple JSON format verifier environment intended to teach the model how to follow output formatting instructions. -# Licensing information -Code: ? -Data: ? + +# Data + +The dataset was created by taking terminal-bench SFT trajectories and slicing them into incremental step-by-step prefixes, each turned into a prompt asking the model for the next action, with outputs validated only for formatting correctness. + +Data: [GitLab model registry link](https://gitlab-master.nvidia.com/bxyu/nemo-gym/-/ml/models/148/versions/170#/) Download this artifact: ```bash @@ -18,6 +20,9 @@ ng_download_dataset_from_gitlab \ Example data: `resources_servers/terminus_format/data/example.jsonl` +# Licensing information +?
+ # Example usage The following are example commands for running this resource server, along with the simple agent and an OpenAI model: @@ -39,3 +44,10 @@ ng_collect_rollouts \ Dependencies - nemo_gym: Apache 2.0 - openapi-schema-validator: [BSD-3-Clause license](https://github.com/python-openapi/openapi-schema-validator/blob/master/LICENSE) + + +# Next Steps + +- Add more template schemas +- Add more generalisable/diverse data +- Add stricter format validation during verification diff --git a/resources_servers/terminus_format/app.py b/resources_servers/terminus_format/app.py index 9dd25c69e..7d796440e 100644 --- a/resources_servers/terminus_format/app.py +++ b/resources_servers/terminus_format/app.py @@ -14,7 +14,6 @@ # limitations under the License. import json -from typing import Any, Dict from fastapi import FastAPI from openapi_schema_validator import validate as validate_against_schema_openapi @@ -31,60 +30,6 @@ class TerminusFormatResourcesServerConfig(BaseResourcesServerConfig): pass -# Fixed JSON schema for the terminal agent response. 
-TERMINUS_FORMAT_SCHEMA: Dict[str, Any] = { - "type": "object", - "title": "terminal_agent_response", - "properties": { - "analysis": {"type": "string"}, - "plan": {"type": "string"}, - "commands": { - "type": "array", - "items": { - "type": "object", - "properties": { - "keystrokes": {"type": "string"}, - "duration": { - "type": "number", - "default": 1.0, - "minimum": 0, - }, - }, - "required": ["keystrokes"], - "additionalProperties": False, - }, - }, - "task_complete": { - "type": "boolean", - "default": False, - }, - }, - "required": ["analysis", "plan", "commands"], - "additionalProperties": False, - # commands must be EITHER: - # - empty array: [] - # - OR array with ≥1 item (and keystrokes required per item) - "anyOf": [ - { - "properties": { - "commands": { - "type": "array", - "maxItems": 0, - } - } - }, - { - "properties": { - "commands": { - "type": "array", - "minItems": 1, - } - } - }, - ], -} - - COMMAND_BATCH_RESPONSE_SCHEMA = { "title": "CommandBatchResponse", "type": "object", @@ -171,52 +116,7 @@ def setup_webserver(self) -> FastAPI: return app - # async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: - # assistant_responses = [] - # for output_item in body.response.output: - # if output_item.type != "message": - # continue - - # for content_item in output_item.content: - # if content_item.type != "output_text": - # continue - - # assistant_responses.append(content_item.text) - - # response_text = "".join(assistant_responses) - # print(response_text) - - # reward = self.evaluate_terminus_format_response_json(response_text) - # return BaseVerifyResponse(**body.model_dump(), reward=reward) - - # ----- JSON Helpers ----- # - # def evaluate_terminus_format_response_json(self, response_text: str) -> float: - # """Validate the model response against the fixed terminus format schema.""" - # try: - # response_obj = json.loads(response_text) - # except Exception: - # # Not valid JSON - # return 0.0 - - # try: - # 
validate_against_schema_openapi(response_obj, COMMAND_BATCH_RESPONSE_SCHEMA) - # except Exception: - # # JSON but does not match schema - # return 0.0 - - # # Valid JSON and matches schema - # return 1.0 - async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: - log_file = "validation_errors.txt" - - # Log that verify was called - with open(log_file, "a") as f: - f.write(f"\n{'=' * 80}\n") - f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") - f.write("🔍 Verify method called\n") - f.write(f"Body: {body.model_dump()}\n") - assistant_responses = [] for output_item in body.response.output: if output_item.type != "message": @@ -230,50 +130,25 @@ async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: response_text = "".join(assistant_responses) - # Log what we extracted - with open(log_file, "a") as f: - f.write(f"Extracted response text length: {len(response_text)}\n") - f.write(f"Response text preview: {response_text[:200]}\n") - f.write(f"{'=' * 80}\n\n") - - print(response_text) - reward = self.evaluate_terminus_format_response_json(response_text) return BaseVerifyResponse(**body.model_dump(), reward=reward) + # ----- JSON Helpers ----- # def evaluate_terminus_format_response_json(self, response_text: str) -> float: """Validate the model response against the fixed terminus format schema.""" - log_file = "validation_errors.txt" - try: response_obj = json.loads(response_text) - except Exception as e: - with open(log_file, "a") as f: - f.write(f"\n{'=' * 80}\n") - f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") - f.write(f"❌ JSON parsing failed: {e}\n") - f.write(f"Response text: {response_text}\n") - f.write(f"{'=' * 80}\n\n") + except Exception: + # Not valid JSON return 0.0 try: validate_against_schema_openapi(response_obj, COMMAND_BATCH_RESPONSE_SCHEMA) - except Exception as e: - with open(log_file, "a") as f: - f.write(f"\n{'=' * 80}\n") - f.write(f"TIMESTAMP: 
{__import__('datetime').datetime.now()}\n") - f.write(f"❌ Schema validation failed: {e}\n") - f.write(f"Response object: {json.dumps(response_obj, indent=2)}\n") - f.write(f"{'=' * 80}\n\n") + except Exception: + # JSON but does not match schema return 0.0 - # with open(log_file, "a") as f: - # f.write(f"\n{'='*80}\n") - # f.write(f"TIMESTAMP: {__import__('datetime').datetime.now()}\n") - # f.write(f"✅ Validation passed!\n") - # f.write(f"Response object: {json.dumps(response_obj, indent=2)}\n") - # f.write(f"{'='*80}\n\n") - + # Valid JSON and matches schema return 1.0 From 5d8caba3152823ef86df31591dc8141b3fe3bbc6 Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Thu, 20 Nov 2025 15:50:02 -0800 Subject: [PATCH 6/7] add tests Signed-off-by: Khushi Bhardwaj --- .../terminus_format/tests/test_app.py | 97 +++++++++++++++++-- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/resources_servers/terminus_format/tests/test_app.py b/resources_servers/terminus_format/tests/test_app.py index 047b8dc6e..5992f5abe 100644 --- a/resources_servers/terminus_format/tests/test_app.py +++ b/resources_servers/terminus_format/tests/test_app.py @@ -5,28 +5,111 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import json +from typing import Any from unittest.mock import MagicMock +from pytest import fixture + from nemo_gym.server_utils import ServerClient -from resources_servers.terminus_format_server.app import ( - TerminusFormatServerResourcesServer, - TerminusFormatServerResourcesServerConfig, +from resources_servers.terminus_format.app import ( + TerminusFormatResourcesServer, + TerminusFormatResourcesServerConfig, ) class TestApp: - def test_sanity(self) -> None: - config = TerminusFormatServerResourcesServerConfig( + @fixture + def config(self) -> TerminusFormatResourcesServerConfig: + return TerminusFormatResourcesServerConfig( host="0.0.0.0", port=8080, entrypoint="", name="", ) - TerminusFormatServerResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + @fixture + def server(self, config: TerminusFormatResourcesServerConfig) -> TerminusFormatResourcesServer: + server_client = MagicMock(spec=ServerClient) + return TerminusFormatResourcesServer(config=config, server_client=server_client) + + def _make_valid_payload(self, overrides: dict[str, Any] | None = None) -> str: + """Create a JSON string that matches COMMAND_BATCH_RESPONSE_SCHEMA, with optional overrides.""" + base: dict[str, Any] = { + "state_analysis": "Current directory contains project files.", + "explanation": "List files, then change into the project directory.", + "commands": [ + { + "keystrokes": "ls -la\n", + "is_blocking": True, + "timeout_sec": 5.0, + }, + { + "keystrokes": "cd project\n", + "is_blocking": False, + "timeout_sec": 2.0, + }, + ], + "is_task_complete": False, + } + if overrides: + # shallow update is enough for tests; nested modifications can pass dicts directly + base.update(overrides) + return json.dumps(base) + + def test_sanity_server_can_be_constructed(self, config: TerminusFormatResourcesServerConfig) -> None: + TerminusFormatResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + def 
test_evaluate_valid_json_returns_full_reward(self, server: TerminusFormatResourcesServer) -> None: + response_text = self._make_valid_payload() + reward = server.evaluate_terminus_format_response_json(response_text) + assert reward == 1.0 + + def test_evaluate_invalid_json_returns_zero(self, server: TerminusFormatResourcesServer) -> None: + # Broken JSON (missing closing brace) + response_text = '{"state_analysis": "oops"' + reward = server.evaluate_terminus_format_response_json(response_text) + assert reward == 0.0 + + def test_missing_required_field_returns_zero(self, server: TerminusFormatResourcesServer) -> None: + # Drop a required top-level field: "commands" + payload = json.loads(self._make_valid_payload()) + payload.pop("commands") + response_text = json.dumps(payload) + + reward = server.evaluate_terminus_format_response_json(response_text) + assert reward == 0.0 + + def test_wrong_type_in_command_returns_zero(self, server: TerminusFormatResourcesServer) -> None: + # "timeout_sec" must be a number; here we make it a string + payload = json.loads(self._make_valid_payload()) + payload["commands"][0]["timeout_sec"] = "not-a-number" + response_text = json.dumps(payload) + + reward = server.evaluate_terminus_format_response_json(response_text) + assert reward == 0.0 + + def test_extra_top_level_field_returns_zero(self, server: TerminusFormatResourcesServer) -> None: + # additionalProperties=False at the top level -> extra field should fail + payload = json.loads(self._make_valid_payload()) + payload["extra_field"] = "not allowed" + response_text = json.dumps(payload) + + reward = server.evaluate_terminus_format_response_json(response_text) + assert reward == 0.0 + + def test_extra_field_in_command_object_returns_zero(self, server: TerminusFormatResourcesServer) -> None: + # Command definition also has additionalProperties=False + payload = json.loads(self._make_valid_payload()) + payload["commands"][0]["extra_field"] = "also not allowed" + response_text = 
json.dumps(payload) + + reward = server.evaluate_terminus_format_response_json(response_text) + assert reward == 0.0 From d6ae991070119d51dff189190e0dce740cc398c0 Mon Sep 17 00:00:00 2001 From: Khushi Bhardwaj Date: Thu, 20 Nov 2025 15:52:12 -0800 Subject: [PATCH 7/7] end newline Signed-off-by: Khushi Bhardwaj --- resources_servers/terminus_format/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources_servers/terminus_format/requirements.txt b/resources_servers/terminus_format/requirements.txt index 3f80e8452..75e60c851 100644 --- a/resources_servers/terminus_format/requirements.txt +++ b/resources_servers/terminus_format/requirements.txt @@ -1,2 +1,2 @@ -e nemo-gym[dev] @ ../../ -openapi-schema-validator==0.6.3 \ No newline at end of file +openapi-schema-validator==0.6.3