diff --git a/resources_servers/terminus_format/README.md b/resources_servers/terminus_format/README.md new file mode 100644 index 000000000..6e1f0cdf3 --- /dev/null +++ b/resources_servers/terminus_format/README.md @@ -0,0 +1,53 @@ +# Description + +This is a simple JSON format verifier environment intended to teach the model how to follow output formatting instructions. + +# Data + +The dataset was created by taking Terminal-Bench SFT trajectories and slicing them into incremental step-by-step prefixes, each turned into a prompt asking the model for the next action, with outputs validated only for formatting correctness. + +Data: [GitLab model registry link](https://gitlab-master.nvidia.com/bxyu/nemo-gym/-/ml/models/148/versions/170#/) + +Download this artifact: +```bash +ng_download_dataset_from_gitlab \ + +dataset_name=terminus_format_dataset \ + +version=0.0.1 \ + +artifact_fpath=example.jsonl \ + +output_fpath=resources_servers/terminus_format/data/example.jsonl +``` + +Example data: +`resources_servers/terminus_format/data/example.jsonl` + +# Licensing information +Code: Apache 2.0. Data: Apache 2.0 (as declared in `configs/terminus_format.yaml`). 
+ +# Example usage + +The following are example commands for running this resource server, along with the simple agent and an OpenAI model: +```bash +config_paths="responses_api_models/openai_model/configs/openai_model.yaml, \ +resources_servers/terminus_format/configs/terminus_format.yaml" +ng_run "+config_paths=[$config_paths]" +``` + +Then, rollouts can be collected using a command such as the following: +```bash +ng_collect_rollouts \ + +agent_name=terminus_format_simple_agent \ + +input_jsonl_fpath=resources_servers/terminus_format/data/example.jsonl \ + +output_jsonl_fpath=results/example_terminus_format_json.jsonl \ + +limit=1 +``` + +Dependencies +- nemo_gym: Apache 2.0 +- openapi-schema-validator: [BSD-3-Clause license](https://github.com/python-openapi/openapi-schema-validator/blob/master/LICENSE) + + +# Next Steps + +- Add more template schemas +- Add more generalisable/diverse data +- Add stricter format validation during verification diff --git a/resources_servers/terminus_format/app.py b/resources_servers/terminus_format/app.py new file mode 100644 index 000000000..7d796440e --- /dev/null +++ b/resources_servers/terminus_format/app.py @@ -0,0 +1,156 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + +from fastapi import FastAPI +from openapi_schema_validator import validate as validate_against_schema_openapi + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) + + +class TerminusFormatResourcesServerConfig(BaseResourcesServerConfig): + pass + + +COMMAND_BATCH_RESPONSE_SCHEMA = { + "title": "CommandBatchResponse", + "type": "object", + "additionalProperties": False, + "definitions": { + "Command": { + "title": "Command", + "type": "object", + "additionalProperties": False, + "properties": { + "keystrokes": { + "title": "Keystrokes", + "description": ( + "Keystrokes to execute in the terminal. Use tmux-style escape " + "sequences for modifier keys (e.g. C-c for ctrl-c). Modifier keys " + "must be sent as their own commands otherwise the characters will " + "be interpreted literally." + ), + "type": "string", + }, + "is_blocking": { + "title": "Is Blocking", + "description": ( + "Whether to wait for and return the terminal output after executing " + "these keystrokes. This will append '; tmux wait -S done' to your " + "command. DO NOT block on modifier keys or inside interactive " + "programs (e.g. vim or less). Only block when the command is " + "executed in the command line, is not interactive, and you expect " + "the output to be returned with no intervention. When in doubt, " + "wait instead of blocking." 
+ ), + "type": "boolean", + }, + "timeout_sec": { + "title": "Timeout Sec", + "description": "The number of expected seconds to wait for the command to complete.", + "type": "number", + }, + }, + "required": ["keystrokes", "is_blocking", "timeout_sec"], + } + }, + "properties": { + "state_analysis": { + "title": "State Analysis", + "description": "Description of the current state of the terminal", + "type": "string", + }, + "explanation": { + "title": "Explanation", + "description": "Brief explanation of what these commands will do", + "type": "string", + }, + "commands": { + "title": "Commands", + "description": "List of shell interactions to execute in the Docker container", + "type": "array", + "items": { + "$ref": "#/definitions/Command", + }, + }, + "is_task_complete": { + "title": "Is Task Complete", + "description": ( + "Whether the task is complete following the execution of these commands. " + "Make sure to check that the command you last executed worked before " + "saying you're done." + ), + "type": "boolean", + }, + }, + "required": ["state_analysis", "explanation", "commands", "is_task_complete"], +} + + +class TerminusFormatResourcesServer(SimpleResourcesServer): + config: TerminusFormatResourcesServerConfig + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + + # Additional server routes go here! 
e.g.: + # app.post("/get_weather")(self.get_weather) + + return app + + async def verify(self, body: BaseVerifyRequest) -> BaseVerifyResponse: + assistant_responses = [] + for output_item in body.response.output: + if output_item.type != "message": + continue + + for content_item in output_item.content: + if content_item.type != "output_text": + continue + + assistant_responses.append(content_item.text) + + response_text = "".join(assistant_responses) + + reward = self.evaluate_terminus_format_response_json(response_text) + return BaseVerifyResponse(**body.model_dump(), reward=reward) + + # ----- JSON Helpers ----- # + def evaluate_terminus_format_response_json(self, response_text: str) -> float: + """Validate the model response against the fixed terminus format schema.""" + try: + response_obj = json.loads(response_text) + except Exception: + # Not valid JSON + return 0.0 + + try: + validate_against_schema_openapi(response_obj, COMMAND_BATCH_RESPONSE_SCHEMA) + except Exception: + # JSON but does not match schema + return 0.0 + + # Valid JSON and matches schema + return 1.0 + + +if __name__ == "__main__": + TerminusFormatResourcesServer.run_webserver() diff --git a/resources_servers/terminus_format/configs/terminus_format.yaml b/resources_servers/terminus_format/configs/terminus_format.yaml new file mode 100644 index 000000000..5c77cbcb9 --- /dev/null +++ b/resources_servers/terminus_format/configs/terminus_format.yaml @@ -0,0 +1,39 @@ +terminus_format: + resources_servers: + terminus_format: + entrypoint: app.py + domain: instruction_following + verified: false +terminus_format_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: terminus_format + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/terminus_format/data/train.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: terminus_format + 
version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: validation + type: validation + jsonl_fpath: resources_servers/terminus_format/data/validation.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: terminus_format + version: 0.0.1 + artifact_fpath: validation.jsonl + license: Apache 2.0 + - name: example + type: example + jsonl_fpath: resources_servers/terminus_format/data/example.jsonl + num_repeats: 1 diff --git a/resources_servers/terminus_format/data/.gitignore b/resources_servers/terminus_format/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/resources_servers/terminus_format/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/resources_servers/terminus_format/data/example.jsonl b/resources_servers/terminus_format/data/example.jsonl new file mode 100644 index 000000000..dfbff59da --- /dev/null +++ b/resources_servers/terminus_format/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. 
Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. 
This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. 
When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. 
If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 0, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. 
The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). 
Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 1, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. 
The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). 
Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've confirmed that Python 3.10.16 is installed, which matches the environment described in the task. Now I need to check if typeguard is installed and install it if needed.\\\",\\n \\\"explanation\\\": \\\"I'll check if typeguard is installed by trying to import it in Python. 
If it's not installed, I'll install it using pip.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python -c \\\\\\\"import typeguard; print('typeguard is installed')\\\\\\\"\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 2, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. 
The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). 
Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've confirmed that Python 3.10.16 is installed, which matches the environment described in the task. Now I need to check if typeguard is installed and install it if needed.\\\",\\n \\\"explanation\\\": \\\"I'll check if typeguard is installed by trying to import it in Python. 
If it's not installed, I'll install it using pip.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python -c \\\\\\\"import typeguard; print('typeguard is installed')\\\\\\\"\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"Typeguard is already installed in the environment. Now I need to create a simple Python module to test with and then create a script that reproduces the issue described in the task.\\\",\\n \\\"explanation\\\": \\\"I'll create a simple Python module called 'my_module.py' with some basic code, then create a test script that enables debug_instrumentation and tries to import the module.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"echo 'def hello():\\\\n return \\\\\\\"Hello, World!\\\\\\\"' > my_module.py\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed# echo 'def hello():\\n> return \\\"Hello, World!\\\"' > my_module.py; tmux wait -S done\\n(testbed) 
root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 3, "total_steps": 32}]}} +{"responses_create_params": {"input": [{"role": "user", "content": "You are an AI assistant completing the next step for this terminal-based coding task.\n\nHere is the status of the agent so far (as a list of messages in JSON):\n\n[\n {\n \"role\": \"user\",\n \"content\": \"You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task instruction and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.\\n\\nFor each response:\\n1. Analyze the current state based on any terminal output provided\\n2. Determine the next set of commands needed to make progress\\n3. Decide if you need to see the output of these commands before proceeding\\n\\nInstruction:\\n# Import hook breaks when debug_instrumentation is enabled\\n\\n## Description\\n\\nWhen using the import hook with debug_instrumentation enabled, the code fails to properly instrument modules. The issue appears to be in the `_call_with_frames_removed` function which is reversing the order of positional arguments, causing AST parsing to fail.\\n\\n## Steps to reproduce\\n\\n1. Create a simple Python module\\n2. Enable debug_instrumentation\\n3. Install the import hook\\n4. 
Try to import the module\\n\\n```python\\nimport typeguard\\nfrom typeguard import config\\nimport sys\\n\\n# Enable debug instrumentation\\nconfig.debug_instrumentation = True\\n\\n# Install the import hook\\ntypeguard.install_import_hook()\\n\\n# Try to import a module\\nimport my_module # This will fail\\n```\\n\\n## Error output\\n\\nWhen trying to import a module with debug_instrumentation enabled, you'll see an error like:\\n\\n```\\nTraceback (most recent call last):\\n File \\\"/path/to/script.py\\\", line 10, in \\n import my_module\\n File \\\"/usr/lib/python3.10/importlib/_bootstrap_external.py\\\", line 883, in exec_module\\n exec(code, module.__dict__)\\n File \\\"\\\", line 241, in _call_with_frames_removed\\nTypeError: expected str, bytes or os.PathLike object, not AST\\n```\\n\\nThe issue is that the arguments to `ast.parse` are being passed in the wrong order, causing the function to receive the path as the source code and vice versa.\\n\\n## Environment\\n\\n- Python 3.10\\n- typeguard latest version\\n\\n## Additional information\\n\\nThe issue seems to be related to the `_call_with_frames_removed` function which is reversing the order of positional arguments. This causes problems when calling functions like `ast.parse` where the order of arguments matters.\\n\\n\\nYour response must be a JSON object that matches this schema:\\n\\n{\\n \\\"$defs\\\": {\\n \\\"Command\\\": {\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"keystrokes\\\": {\\n \\\"description\\\": \\\"Keystrokes to execute in the terminal. Use tmux-style escape sequences for modifier keys (e.g. C-c for ctrl-c). Modifier keys must be sent as their own commands otherwise the characters will be interpreted literally.\\\",\\n \\\"title\\\": \\\"Keystrokes\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"is_blocking\\\": {\\n \\\"description\\\": \\\"Whether to wait for and return the terminal output after executing these keystrokes. 
This will append '; tmux wait -S done' to your command.DO NOT block on modifier keys or inside interactive programs (e.g. vim or less). Only block when the command is executed in the command line, is not interactive, and you expect the output to be returned with no intervention. When in doubt, wait instead of blocking.\\\",\\n \\\"title\\\": \\\"Is Blocking\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n },\\n \\\"timeout_sec\\\": {\\n \\\"description\\\": \\\"The number of expected seconds to wait for the command to complete.\\\",\\n \\\"title\\\": \\\"Timeout Sec\\\",\\n \\\"type\\\": \\\"number\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"keystrokes\\\",\\n \\\"is_blocking\\\",\\n \\\"timeout_sec\\\"\\n ],\\n \\\"title\\\": \\\"Command\\\",\\n \\\"type\\\": \\\"object\\\"\\n }\\n },\\n \\\"additionalProperties\\\": false,\\n \\\"properties\\\": {\\n \\\"state_analysis\\\": {\\n \\\"description\\\": \\\"Description of the current state of the terminal\\\",\\n \\\"title\\\": \\\"State Analysis\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"explanation\\\": {\\n \\\"description\\\": \\\"Brief explanation of what these commands will do\\\",\\n \\\"title\\\": \\\"Explanation\\\",\\n \\\"type\\\": \\\"string\\\"\\n },\\n \\\"commands\\\": {\\n \\\"description\\\": \\\"List of shell interactions to execute in the Docker container\\\",\\n \\\"items\\\": {\\n \\\"$ref\\\": \\\"#/$defs/Command\\\"\\n },\\n \\\"title\\\": \\\"Commands\\\",\\n \\\"type\\\": \\\"array\\\"\\n },\\n \\\"is_task_complete\\\": {\\n \\\"description\\\": \\\"Whether the task is complete following the execution of these commands. 
Make sure to check that the command you last executed worked before saying you're done.\\\",\\n \\\"title\\\": \\\"Is Task Complete\\\",\\n \\\"type\\\": \\\"boolean\\\"\\n }\\n },\\n \\\"required\\\": [\\n \\\"state_analysis\\\",\\n \\\"explanation\\\",\\n \\\"commands\\\",\\n \\\"is_task_complete\\\"\\n ],\\n \\\"title\\\": \\\"CommandBatchResponse\\\",\\n \\\"type\\\": \\\"object\\\"\\n}\\n\\nDon't include markdown formatting.\\n\\nNote that you operate directly on the terminal from inside a tmux session. Use tmux keystrokes like `C-x` or `Escape` to interactively navigate the terminal. If you would like to execute a command that you have written you will need to append a newline character to the end of your command.\\n\\nFor example, if you write \\\"ls -la\\\" you will need to append a newline character to the end of your command like this: `ls -la\\\\n`.\\n\\nOne thing to be very careful about is handling interactive sessions like less, vim, or git diff. In these cases, you should not wait for the output of the command. Instead, you should send the keystrokes to the terminal as if you were typing them.\\n\\nThe current terminal state is:\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I'm at the root prompt in a testbed environment. 
I need to first check what Python version is installed and then create a test case to reproduce the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll check the Python version and install typeguard if it's not already installed.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python --version\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've confirmed that Python 3.10.16 is installed, which matches the environment described in the task. Now I need to check if typeguard is installed and install it if needed.\\\",\\n \\\"explanation\\\": \\\"I'll check if typeguard is installed by trying to import it in Python. 
If it's not installed, I'll install it using pip.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"python -c \\\\\\\"import typeguard; print('typeguard is installed')\\\\\\\"\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"Typeguard is already installed in the environment. Now I need to create a simple Python module to test with and then create a script that reproduces the issue described in the task.\\\",\\n \\\"explanation\\\": \\\"I'll create a simple Python module called 'my_module.py' with some basic code, then create a test script that enables debug_instrumentation and tries to import the module.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"echo 'def hello():\\\\n return \\\\\\\"Hello, World!\\\\\\\"' > my_module.py\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 2\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed# echo 'def hello():\\n> return \\\"Hello, World!\\\"' > my_module.py; tmux wait -S done\\n(testbed) 
root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n },\n {\n \"role\": \"assistant\",\n \"content\": \"{\\n \\\"state_analysis\\\": \\\"I've created a simple Python module called 'my_module.py'. Now I need to create a test script that reproduces the issue with typeguard's import hook and debug_instrumentation.\\\",\\n \\\"explanation\\\": \\\"I'll create a test script that enables debug_instrumentation, installs the import hook, and then tries to import the module we just created.\\\",\\n \\\"commands\\\": [\\n {\\n \\\"keystrokes\\\": \\\"cat > test_import_hook.py << 'EOF'\\\\nimport typeguard\\\\nfrom typeguard import config\\\\nimport sys\\\\n\\\\n# Enable debug instrumentation\\\\nconfig.debug_instrumentation = True\\\\n\\\\n# Install the import hook\\\\ntypeguard.install_import_hook()\\\\n\\\\n# Try to import a module\\\\nimport my_module # This will fail\\\\n\\\\nprint(\\\\\\\"Import successful!\\\\\\\")\\\\nprint(my_module.hello())\\\\nEOF\\\\n\\\",\\n \\\"is_blocking\\\": true,\\n \\\"timeout_sec\\\": 3\\n }\\n ],\\n \\\"is_task_complete\\\": false\\n}\"\n },\n {\n \"role\": \"user\",\n \"content\": \"(testbed) root@fde35d7355a5:/testbed# python --version; tmux wait -S done\\nPython 3.10.16\\n(testbed) root@fde35d7355a5:/testbed# python -c \\\"import typeguard; print('typeguard is installed')\\\"; tmux wait -S done\\ntypeguard is installed\\n(testbed) root@fde35d7355a5:/testbed# echo 'def hello():\\n> return \\\"Hello, World!\\\"' > my_module.py; tmux wait -S done\\n(testbed) root@fde35d7355a5:/testbed# cat > test_import_hook.py << 'EOF'\\n> import typeguard\\n> from typeguard import config\\n> import sys\\n>\\n> # Enable debug instrumentation\\n> config.debug_instrumentation = True\\n>\\n> # Install the import hook\\n> typeguard.install_import_hook()\\n>\\n> # Try to import a module\\n> import my_module # This will fail\\n>\\n> print(\\\"Import successful!\\\")\\n> 
print(my_module.hello())\\n> EOF\\n(testbed) root@fde35d7355a5:/testbed#\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"\n }\n]\n\nPlease write the next step in the expected output format.\nRespond ONLY with the JSON object for the next step, in the required schema.\n", "source_idx": 0, "slice_idx": 4, "total_steps": 32}]}} diff --git a/resources_servers/terminus_format/requirements.txt b/resources_servers/terminus_format/requirements.txt new file mode 100644 index 000000000..75e60c851 --- /dev/null +++ b/resources_servers/terminus_format/requirements.txt @@ -0,0 +1,2 @@ +-e nemo-gym[dev] @ ../../ +openapi-schema-validator==0.6.3 diff --git a/resources_servers/terminus_format/tests/test_app.py b/resources_servers/terminus_format/tests/test_app.py new file mode 100644 index 000000000..5992f5abe --- /dev/null +++ b/resources_servers/terminus_format/tests/test_app.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json
from typing import Any
from unittest.mock import MagicMock

from pytest import fixture

from nemo_gym.server_utils import ServerClient
from resources_servers.terminus_format.app import (
    TerminusFormatResourcesServer,
    TerminusFormatResourcesServerConfig,
)


class TestApp:
    """Tests for ``TerminusFormatResourcesServer.evaluate_terminus_format_response_json``.

    Each test feeds the server a response string and checks the reward it
    returns: ``1.0`` for the well-formed payload, ``0.0`` for every malformed
    variant exercised here.
    """

    @fixture
    def config(self) -> TerminusFormatResourcesServerConfig:
        """Provide a minimal server configuration shared by the tests."""
        settings = {"host": "0.0.0.0", "port": 8080, "entrypoint": "", "name": ""}
        return TerminusFormatResourcesServerConfig(**settings)

    @fixture
    def server(self, config: TerminusFormatResourcesServerConfig) -> TerminusFormatResourcesServer:
        """Provide a server under test whose ``ServerClient`` is a spec'd mock."""
        return TerminusFormatResourcesServer(
            config=config,
            server_client=MagicMock(spec=ServerClient),
        )

    def _make_valid_payload(self, overrides: dict[str, Any] | None = None) -> str:
        """Return a JSON string intended to satisfy COMMAND_BATCH_RESPONSE_SCHEMA.

        Args:
            overrides: Optional top-level keys whose values replace (or extend)
                the defaults before serialization.

        Returns:
            The payload serialized with ``json.dumps``.
        """
        list_files = {"keystrokes": "ls -la\n", "is_blocking": True, "timeout_sec": 5.0}
        enter_project = {"keystrokes": "cd project\n", "is_blocking": False, "timeout_sec": 2.0}
        payload: dict[str, Any] = {
            "state_analysis": "Current directory contains project files.",
            "explanation": "List files, then change into the project directory.",
            "commands": [list_files, enter_project],
            "is_task_complete": False,
        }
        # Top-level replacement only (no deep merge): callers that need a nested
        # change pass the complete replacement value for that key.
        payload.update(overrides or {})
        return json.dumps(payload)

    def _valid_document(self) -> dict[str, Any]:
        """Return the valid payload decoded back into a dict, ready to be mutated."""
        return json.loads(self._make_valid_payload())

    @staticmethod
    def _reward_for(server: TerminusFormatResourcesServer, document: dict[str, Any]) -> Any:
        """Serialize ``document`` and return the reward the server assigns to it."""
        return server.evaluate_terminus_format_response_json(json.dumps(document))

    def test_sanity_server_can_be_constructed(self, config: TerminusFormatResourcesServerConfig) -> None:
        """Constructing the server with a mocked client must not raise."""
        client = MagicMock(spec=ServerClient)
        TerminusFormatResourcesServer(config=config, server_client=client)

    def test_evaluate_valid_json_returns_full_reward(self, server: TerminusFormatResourcesServer) -> None:
        """The unmodified valid payload earns the full reward."""
        well_formed = self._make_valid_payload()
        assert server.evaluate_terminus_format_response_json(well_formed) == 1.0

    def test_evaluate_invalid_json_returns_zero(self, server: TerminusFormatResourcesServer) -> None:
        """Text that is not parseable JSON earns no reward."""
        # The closing brace is deliberately missing.
        truncated = '{"state_analysis": "oops"'
        assert server.evaluate_terminus_format_response_json(truncated) == 0.0

    def test_missing_required_field_returns_zero(self, server: TerminusFormatResourcesServer) -> None:
        """Dropping the required top-level "commands" field earns no reward."""
        document = self._valid_document()
        del document["commands"]
        assert self._reward_for(server, document) == 0.0

    def test_wrong_type_in_command_returns_zero(self, server: TerminusFormatResourcesServer) -> None:
        """A string where "timeout_sec" must be a number earns no reward."""
        document = self._valid_document()
        first_command = document["commands"][0]
        first_command["timeout_sec"] = "not-a-number"
        assert self._reward_for(server, document) == 0.0

    def test_extra_top_level_field_returns_zero(self, server: TerminusFormatResourcesServer) -> None:
        """An unknown top-level key earns no reward (additionalProperties=False)."""
        document = self._valid_document()
        document["extra_field"] = "not allowed"
        assert self._reward_for(server, document) == 0.0

    def test_extra_field_in_command_object_returns_zero(self, server: TerminusFormatResourcesServer) -> None:
        """An unknown key inside a command earns no reward (additionalProperties=False)."""
        document = self._valid_document()
        first_command = document["commands"][0]
        first_command["extra_field"] = "also not allowed"
        assert self._reward_for(server, document) == 0.0