From 9a478effba7d3b837765ace2de51fd277c066b41 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 20 Jan 2026 21:37:03 -0800 Subject: [PATCH 001/127] print client response error on debug Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index a7ca1f2f8..58ae1337f 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -433,6 +433,9 @@ async def exception_handling_middleware(request: Request, call_next): ) response_content = f"Hit an exception in {self.get_session_middleware_key()} calling an inner server: {e.response_content}" + if _GLOBAL_AIOHTTP_CLIENT_REQUEST_DEBUG: + print(response_content) + return JSONResponse(content=response_content, status_code=500) except Exception as e: print( From 5c0f1588817545326c592a5c7d7783a0a8a84536 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:08:46 -0800 Subject: [PATCH 002/127] print result Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index a1e1557aa..610ed4faa 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -212,6 +212,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(run_swebench_evaluation, params) result = await future + # TODO remove + print(result) + # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) From ec0f4d4bf31f47bff5876e9d4baf0a2518a1f98c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:15:53 -0800 Subject: [PATCH 003/127] print params Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 610ed4faa..5e7cd5d12 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -209,6 +209,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_dir": instance_dir, } + # TODO remove + print(params) + future = runner_ray_remote.remote(run_swebench_evaluation, params) result = await future From f3b8e3bc777421ebaa76437dbcb8814847dc9ca8 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:18:20 -0800 Subject: [PATCH 004/127] pritn Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 376b4f153..939808e16 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -684,6 +684,10 @@ async def run_swebench_evaluation( r2e_gym_setup_dir=r2e_gym_setup_dir, dataset_path=dataset_path, ) + + # TODO remove + print("Hit before run_oh") + result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) From 8b9b2877807ba2320db27b174e256ca980ddf4a6 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:23:00 -0800 Subject: [PATCH 005/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 939808e16..32527527c 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -640,6 +640,9 @@ async def run_swebench_evaluation( dataset_path: Optional[str] = None, instance_dir: Optional[str] = None, ) -> Dict: + # TODO remove + print("Hit inside") + # Create persistent directory for I/O and logs in local workspace workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) instance_id = problem_info.get("instance_id", "unknown") From 86b6fd91180739a402e013dd095355ef47fcb899 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:24:37 -0800 Subject: [PATCH 006/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 ------ responses_api_agents/swe_agents/utils.py | 6 ------ 2 files changed, 12 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 5e7cd5d12..a1e1557aa 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -209,15 +209,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_dir": instance_dir, } - # TODO remove - print(params) - future = runner_ray_remote.remote(run_swebench_evaluation, params) result = await future - # TODO remove - print(result) - # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 32527527c..b8cd5604f 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -640,9 +640,6 @@ async def run_swebench_evaluation( dataset_path: Optional[str] = None, instance_dir: Optional[str] = None, ) -> Dict: - # TODO remove - print("Hit inside") - # Create persistent directory for I/O and logs in local workspace workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) instance_id = problem_info.get("instance_id", "unknown") @@ -688,9 +685,6 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) - # TODO remove - print("Hit before run_oh") - result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) From f46109f20918e66e686697af494f53c6907e4f5a Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:25:41 -0800 Subject: [PATCH 007/127] try traceback Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index a1e1557aa..0cc7a6729 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -298,6 +298,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() tools=[], metadata={"error": str(e)}, ) + except: + import traceback + + traceback.print_exc() async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" From 8971ad50ce1e5af5d963b8d14556c14da5d53816 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:27:19 -0800 Subject: [PATCH 008/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 0cc7a6729..ee723a42e 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -60,6 +60,8 @@ }, ) def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any: + # TODO remove + print("Hit in runner_ray_remote") return asyncio.run(runner(**params)) @@ -298,10 +300,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() tools=[], metadata={"error": str(e)}, ) - except: - import traceback - - traceback.print_exc() async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" From 11eef9042ebc3ba3175337b763495980350dcf46 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:29:11 -0800 Subject: [PATCH 009/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index ee723a42e..da3d5921a 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -60,8 +60,6 @@ }, ) def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any: - # TODO remove - print("Hit in runner_ray_remote") return asyncio.run(runner(**params)) @@ -212,6 +210,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() } future = runner_ray_remote.remote(run_swebench_evaluation, params) + # TODO remove + print("FUTURE", future) + result = await future # Extract trajectory and convert to proper NeMoGym format From 6d384fc669d58e114260f0341fdb183ed07fa8d0 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:33:36 -0800 Subject: [PATCH 010/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index da3d5921a..23fd354d8 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -209,6 +209,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_dir": instance_dir, } + print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params)) future = runner_ray_remote.remote(run_swebench_evaluation, params) # TODO remove print("FUTURE", future) From 91271d5f0c52f106a6ae7c6c5c4880d1dcad3acb Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:35:24 -0800 Subject: [PATCH 011/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 23fd354d8..c6e0009fc 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -209,10 +209,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_dir": instance_dir, } - print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params)) + print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params), flush=True) future = runner_ray_remote.remote(run_swebench_evaluation, params) # TODO remove - print("FUTURE", future) + print("FUTURE", future, flush=True) result = await future @@ -306,6 +306,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" async with self.sem: + # TODO remove + print("hit inside run", flush=True) + # Fix None values in responses_create_params to use defaults # This is needed because the pydantic model has non-Optional fields with defaults From 35cb2281e5f8fd1a082be339e1e7e293877a6501 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:36:57 -0800 Subject: [PATCH 012/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index c6e0009fc..a1e1557aa 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -209,11 +209,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_dir": instance_dir, } - print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params), flush=True) future = runner_ray_remote.remote(run_swebench_evaluation, params) - # TODO remove - print("FUTURE", future, flush=True) - result = await future # Extract trajectory and convert to proper NeMoGym format @@ -306,9 +302,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" async with self.sem: - # TODO remove - print("hit inside run", flush=True) - # Fix None values in responses_create_params to use defaults # This is needed because the pydantic model has non-Optional fields with defaults From cb0ff0b842bbde6fbe551d85221d4421afa6163c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:37:43 -0800 Subject: [PATCH 013/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index a1e1557aa..8bb414331 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -212,6 +212,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(run_swebench_evaluation, params) result = await future + # TODO remove + print("RESULT", result, flush=True) + # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) @@ -277,6 +280,11 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() ) except Exception as e: + # TODO remove + import traceback + + print(traceback.format_exc(), flush=True) + print(f"SWE-bench evaluation failed: {str(e)}", flush=True) # Return error response error_message = NeMoGymResponseOutputMessage( @@ -298,6 +306,11 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() tools=[], metadata={"error": str(e)}, ) + except: + # TODO remove + import traceback + + print(traceback.format_exc(), flush=True) async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" From badddbaac26811ce9e4c1dfbb69ade88039de43a Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:40:04 -0800 Subject: [PATCH 014/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index b8cd5604f..bd2146838 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -640,6 +640,9 @@ async def run_swebench_evaluation( dataset_path: Optional[str] = None, instance_dir: Optional[str] = None, ) -> Dict: + # TODO remove + print("Hit inside run_swebench_evaluation", problem_info, flush=True) + # Create persistent directory for I/O and logs in local workspace workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) instance_id = problem_info.get("instance_id", "unknown") @@ -685,6 +688,8 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) + # TODO remove + print("Hit before process_single_datapoint", problem_info, flush=True) result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) From 4e239f74e760b05a9070f780d911490e46668c63 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:43:18 -0800 Subject: [PATCH 015/127] print exc Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index bd2146838..d691551ce 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -640,9 +640,6 @@ async def run_swebench_evaluation( dataset_path: Optional[str] = None, instance_dir: Optional[str] = None, ) -> Dict: - # TODO remove - print("Hit inside run_swebench_evaluation", problem_info, flush=True) - # Create persistent directory for I/O and logs in local workspace workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) instance_id = problem_info.get("instance_id", "unknown") @@ -688,9 +685,14 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) - # TODO remove - print("Hit before process_single_datapoint", problem_info, flush=True) - result = await run_oh.process_single_datapoint(problem_info) + try: + result = await run_oh.process_single_datapoint(problem_info) + except: + # TODO remove + import traceback + + print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True) + print(f"Process completed for {instance_id}", flush=True) try: From 890376e4322d726a16a9cb585d41b7f4eaf8171f Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:44:50 -0800 Subject: [PATCH 016/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 5 +++++ responses_api_agents/swe_agents/utils.py | 9 +-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 16e30ec4f..ea01ab73c 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -1025,5 +1025,10 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } return output_dict + except: + # TODO remove + import traceback + + print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True) finally: self._cleanup_instance_dataset(instance_dataset_path) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index d691551ce..b8cd5604f 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -685,14 +685,7 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) - try: - result = await run_oh.process_single_datapoint(problem_info) - except: - # TODO remove - import traceback - - print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True) - + result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) try: From 702e2998f9e5c5223aa712e1aa1da2b954a158c5 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:49:32 -0800 Subject: [PATCH 017/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index ea01ab73c..bcf677bec 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -909,6 +909,7 @@ def check_tests_passed( return required_tests <= passed_tests async def process_single_datapoint(self, data_point: dict[str, Any]): + print("HIT 1", flush=True) self.output_dir = Path(self.cfg.output_file).parent agent_run_id = f"{data_point['instance_id']}_{int(time.time())}_{str(uuid.uuid4())[:8]}" @@ -921,12 +922,14 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): trajectory_dict = None try: if self.cfg.agent_framework == SupportedAgentFrameworks.swe_agent: + print("HIT 2", flush=True) pred_file = await self._run_swe_agent( data_point, api_base, instance_dataset_path, ) elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands: + print("HIT 3", flush=True) pred_file = await self._run_openhands( data_point, api_base, @@ -934,14 +937,17 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): instance_dataset_path, ) else: + print("HIT 4", flush=True) raise ValueError( f"Unsupported agent framework: {self.cfg.agent_framework}. " f"Supported frameworks: {', '.join(SupportedAgentFrameworks)}." ) + print("HIT 5", flush=True) generation_time = asyncio.get_running_loop().time() - start_time if pred_file is None: + print("HIT 6", flush=True) report_json = { data_point["instance_id"]: { "resolved": False, @@ -952,6 +958,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } } else: + print("HIT 7", flush=True) pred_mounted_path = pred_file.replace(str(self.output_dir), "/trajectories_mount") with open(pred_file, "r") as f: trajectory_dict = json.loads(f.read()) @@ -960,6 +967,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): has_patch = trajectory_dict["model_patch"] is not None if not has_patch: + print("HIT 8", flush=True) report_json = { data_point["instance_id"]: { "resolved": False, @@ -971,17 +979,20 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } else: + print("HIT 9", flush=True) # Run full evaluation with streaming output # TODO: should we fail on errors here? Seems that json isn't always generated try: start_time = asyncio.get_running_loop().time() if data_point["dataset_name"] == "nv-internal-1": + print("HIT 10", flush=True) report_file = await self._run_nv_internal_eval( data_point, trajectory_dict["model_patch"], instance_dataset_path, ) elif "R2E-Gym" in data_point["dataset_name"]: + print("HIT 11", flush=True) report_file = await self._run_r2e_gym_eval( pred_mounted_path, data_point, @@ -989,6 +1000,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): instance_dataset_path, ) else: + print("HIT 12", flush=True) report_file = await self._run_swebench_eval( pred_mounted_path, data_point, @@ -997,6 +1009,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): ) evaluation_time = asyncio.get_running_loop().time() - start_time except ValueError: + print("HIT 13", flush=True) print( f"Failed to execute SWE-bench evaluation command for {data_point['instance_id']}", flush=True, @@ -1012,10 +1025,13 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } report_file = None + print("HIT 14", flush=True) if report_file is not None: + print("HIT 15", flush=True) with open(report_file, "r") as f: report_json = json.loads(f.read().strip()) + print("HIT 16", flush=True) output_dict = { "swe-bench-metrics": report_json[data_point["instance_id"]], "swe-bench-outputs": trajectory_dict, From 3b911a77745f1d81d2a4ff22e9a58aa9978e1041 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 16:59:48 -0800 Subject: [PATCH 018/127] print hit Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index b8cd5604f..88ba0d368 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -685,6 +685,7 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) + print("HIT before process_single_datapoint", problem_info, flush=True) result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) From 6ba18ad5fde9403760fc964dc33eecd6ebf1dd13 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 17:04:21 -0800 Subject: [PATCH 019/127] clean Signed-off-by: Brian Yu --- .../swe_agents/run_openhands.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index bcf677bec..16e30ec4f 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -909,7 +909,6 @@ def check_tests_passed( return required_tests <= passed_tests async def process_single_datapoint(self, data_point: dict[str, Any]): - print("HIT 1", flush=True) self.output_dir = Path(self.cfg.output_file).parent agent_run_id = f"{data_point['instance_id']}_{int(time.time())}_{str(uuid.uuid4())[:8]}" @@ -922,14 +921,12 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): trajectory_dict = None try: if self.cfg.agent_framework == SupportedAgentFrameworks.swe_agent: - print("HIT 2", flush=True) pred_file = await self._run_swe_agent( data_point, api_base, instance_dataset_path, ) elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands: - print("HIT 3", flush=True) pred_file = await self._run_openhands( data_point, api_base, @@ -937,17 +934,14 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): instance_dataset_path, ) else: - print("HIT 4", flush=True) raise ValueError( f"Unsupported agent framework: {self.cfg.agent_framework}. " f"Supported frameworks: {', '.join(SupportedAgentFrameworks)}." ) - print("HIT 5", flush=True) generation_time = asyncio.get_running_loop().time() - start_time if pred_file is None: - print("HIT 6", flush=True) report_json = { data_point["instance_id"]: { "resolved": False, @@ -958,7 +952,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } } else: - print("HIT 7", flush=True) pred_mounted_path = pred_file.replace(str(self.output_dir), "/trajectories_mount") with open(pred_file, "r") as f: trajectory_dict = json.loads(f.read()) @@ -967,7 +960,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): has_patch = trajectory_dict["model_patch"] is not None if not has_patch: - print("HIT 8", flush=True) report_json = { data_point["instance_id"]: { "resolved": False, @@ -979,20 +971,17 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } else: - print("HIT 9", flush=True) # Run full evaluation with streaming output # TODO: should we fail on errors here? Seems that json isn't always generated try: start_time = asyncio.get_running_loop().time() if data_point["dataset_name"] == "nv-internal-1": - print("HIT 10", flush=True) report_file = await self._run_nv_internal_eval( data_point, trajectory_dict["model_patch"], instance_dataset_path, ) elif "R2E-Gym" in data_point["dataset_name"]: - print("HIT 11", flush=True) report_file = await self._run_r2e_gym_eval( pred_mounted_path, data_point, @@ -1000,7 +989,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): instance_dataset_path, ) else: - print("HIT 12", flush=True) report_file = await self._run_swebench_eval( pred_mounted_path, data_point, @@ -1009,7 +997,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): ) evaluation_time = asyncio.get_running_loop().time() - start_time except ValueError: - print("HIT 13", flush=True) print( f"Failed to execute SWE-bench evaluation command for {data_point['instance_id']}", flush=True, @@ -1025,13 +1012,10 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } report_file = None - print("HIT 14", flush=True) if report_file is not None: - print("HIT 15", flush=True) with open(report_file, "r") as f: report_json = json.loads(f.read().strip()) - print("HIT 16", flush=True) output_dict = { "swe-bench-metrics": report_json[data_point["instance_id"]], "swe-bench-outputs": trajectory_dict, @@ -1041,10 +1025,5 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): } return output_dict - except: - # TODO remove - import traceback - - print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True) finally: self._cleanup_instance_dataset(instance_dataset_path) From 3726bba79ff4ab5a7cbc926e7c05eb44f2eb82cc Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 17:14:34 -0800 Subject: [PATCH 020/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 10 ---------- responses_api_agents/swe_agents/utils.py | 1 - 2 files changed, 11 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 8bb414331..99ceac029 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -280,11 +280,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() ) except Exception as e: - # TODO remove - import traceback - - print(traceback.format_exc(), flush=True) - print(f"SWE-bench evaluation failed: {str(e)}", flush=True) # Return error response error_message = NeMoGymResponseOutputMessage( @@ -306,11 +301,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() tools=[], metadata={"error": str(e)}, ) - except: - # TODO remove - import traceback - - print(traceback.format_exc(), flush=True) async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 88ba0d368..b8cd5604f 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -685,7 +685,6 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) - print("HIT before process_single_datapoint", problem_info, flush=True) result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) From 7ae04fa9fa17663e70177c553e7f3c399ba18749 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 17:21:16 -0800 Subject: [PATCH 021/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 99ceac029..a1e1557aa 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -212,9 +212,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(run_swebench_evaluation, params) result = await future - # TODO remove - print("RESULT", result, flush=True) - # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) From 78cce58d2f7d2302542b0fe8824d989dc73ff6b0 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 21 Jan 2026 18:03:26 -0800 Subject: [PATCH 022/127] feat: oh metrics block commands Signed-off-by: Sugam Devare --- responses_api_agents/swe_agents/app.py | 5 +++++ .../swe_agents/configs/swebench_openhands.yaml | 2 +- .../swe_agents/configs/swebench_openhands_training.yaml | 4 ++-- responses_api_agents/swe_agents/run_openhands.py | 3 ++- responses_api_agents/swe_agents/utils.py | 6 ++++-- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index a1e1557aa..1c0c9b046 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -60,6 +60,8 @@ }, ) def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any: + ray_submit_time = time.time() + params["ray_submit_time"] = ray_submit_time return asyncio.run(runner(**params)) @@ -173,6 +175,7 @@ def model_post_init(self, __context: Any) -> None: print("Dependencies repositories set up complete", flush=True) self.config.run_session_id = f"{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}" + print(f"Run session ID: {self.config.run_session_id}", flush=True) async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()) -> NeMoGymResponse: # Extract problem information from request @@ -189,6 +192,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() f"{problem_info.get('instance_id', 'unknown')}_{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}" ) try: + ray_queue_time = time.time() params = { "problem_info": problem_info, "model_endpoint": model_endpoint, @@ -207,6 +211,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir, "dataset_path": self.config.dataset_path, "instance_dir": instance_dir, + "ray_queue_time": ray_queue_time, } future = runner_ray_remote.remote(run_swebench_evaluation, params) diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml index 5e29b3968..eb4a57583 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml @@ -9,7 +9,7 @@ swe_agents: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 7af10584eb623e6d50a616d3c3c967d7d4fb3690 # pragma: allowlist secret + agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 # Container configuration container_formatter: ??? diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml index 7e6eacda0..d39fac5e1 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml @@ -8,7 +8,7 @@ swe_agents_train: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 7af10584eb623e6d50a616d3c3c967d7d4fb3690 # pragma: allowlist secret + agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 # Container configuration container_formatter: ??? container_folder_path: null @@ -39,7 +39,7 @@ swe_agents_val: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 7af10584eb623e6d50a616d3c3c967d7d4fb3690 # pragma: allowlist secret + agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 # Container configuration container_formatter: ??? container_folder_path: null diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 16e30ec4f..1eb4e3313 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -353,6 +353,7 @@ async def _run_openhands( "model_name_or_path": out_dict["metadata"]["llm_config"]["model"], "instance_id": out_dict["instance_id"], "model_patch": patch + "\n" if patch and not patch.endswith("\n") else patch, + "oh_time_metrics": out_dict["metrics"], } ) ) @@ -1018,7 +1019,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): output_dict = { "swe-bench-metrics": report_json[data_point["instance_id"]], - "swe-bench-outputs": trajectory_dict, + "oh_time_metrics": trajectory_dict.get("oh_time_metrics", None) if trajectory_dict else {}, "generation": "", # required TODO: we should fix this "generation_time": generation_time, "evaluation_time": evaluation_time, diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 376b4f153..e60df2ea9 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -639,6 +639,8 @@ async def run_swebench_evaluation( r2e_gym_setup_dir: Optional[Path] = None, dataset_path: Optional[str] = None, instance_dir: Optional[str] = None, + ray_queue_time: Optional[float] = None, + ray_submit_time: Optional[float] = None, ) -> Dict: # Create persistent directory for I/O and logs in local workspace workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) @@ -687,6 +689,8 @@ async def run_swebench_evaluation( result = await run_oh.process_single_datapoint(problem_info) print(f"Process completed for {instance_id}", flush=True) + result["oh_time_metrics"]["ray_time_in_queue"] = ray_submit_time - ray_queue_time + try: with open(output_file, "w") as f: json.dump(result, f) @@ -707,8 +711,6 @@ async def run_swebench_evaluation( agent_tools_file if agent_framework == "swe_agent" else None, ) - # tools = convert_tools_to_function_format(tools) if tools else [] - result["tools"] = tools result["trajectory"] = trajectory_data From 2dac87105bbd5593e9bccbd42578e541539e1519 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 22:12:07 -0800 Subject: [PATCH 023/127] try cpus 0.5; print num containers in parallel Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 34 +++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 1c0c9b046..87aed81ee 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -53,16 +53,40 @@ ) +@ray.remote +class ConcurrentContainerCounter: + def __init__(self): + self.concurrent_containers = 0 + + def increment(self): + self.counter += 1 + return self.counter + + def decrement(self): + self.counter += 1 + return self.counter + + @ray.remote( scheduling_strategy="SPREAD", runtime_env={ "py_executable": sys.executable, }, + num_cpus=0.5, ) -def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any: +def runner_ray_remote( + concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any] +) -> Any: + concurrent_containers = concurrent_container_counter.increment.remote() + print(f"Concurrent container #{concurrent_containers}", flush=True) + ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time - return asyncio.run(runner(**params)) + result = asyncio.run(runner(**params)) + + concurrent_container_counter.decrement.remote() + + return result class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig): @@ -162,6 +186,7 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent): def model_post_init(self, __context: Any) -> None: self.sem = Semaphore(self.config.concurrency) + self.container_counter = ConcurrentContainerCounter.remote() # Pre-build OpenHands environment if using openhands framework if self.config.agent_framework == "openhands": @@ -214,7 +239,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "ray_queue_time": ray_queue_time, } - future = runner_ray_remote.remote(run_swebench_evaluation, params) + future = runner_ray_remote.remote(self.container_counter, run_swebench_evaluation, params) result = await future # Extract trajectory and convert to proper NeMoGym format @@ -307,6 +332,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" async with self.sem: + print(f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True) + body.responses_create_params.metadata["container_concurrency"] = self.config.concurrency - self.sem._value + # Fix None values in responses_create_params to use defaults # This is needed because the pydantic model has non-Optional fields with defaults From be3f4c284dd8ce00825f22b38e48ed1f937b41db Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 22:15:19 -0800 Subject: [PATCH 024/127] try add container counter param Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 87aed81ee..dbbd1f2c0 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -182,6 +182,7 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent): config: SWEBenchWrapperConfig sem: Semaphore = None + container_counter: ConcurrentContainerCounter = None model_config = ConfigDict(arbitrary_types_allowed=True) def model_post_init(self, __context: Any) -> None: From 98efc9f6fb489477bf8ae7cc5ccce26c52500899 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Wed, 21 Jan 2026 22:17:21 -0800 Subject: [PATCH 025/127] use private Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index dbbd1f2c0..d78323263 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -182,12 +182,12 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent): config: SWEBenchWrapperConfig sem: Semaphore = None - container_counter: ConcurrentContainerCounter = None + _container_counter: ConcurrentContainerCounter = None model_config = ConfigDict(arbitrary_types_allowed=True) def model_post_init(self, __context: Any) -> None: self.sem = Semaphore(self.config.concurrency) - self.container_counter = ConcurrentContainerCounter.remote() + self._container_counter = ConcurrentContainerCounter.remote() # Pre-build OpenHands environment if using openhands framework if self.config.agent_framework == "openhands": @@ -240,7 +240,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "ray_queue_time": ray_queue_time, } - future = runner_ray_remote.remote(self.container_counter, run_swebench_evaluation, params) + future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params) result = await future # Extract trajectory and convert to proper NeMoGym format From 1709e9ed030fcd52dcf1bfa9228beafc03585230 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 09:03:44 -0800 Subject: [PATCH 026/127] ray get Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index d78323263..c273cd3d2 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -77,14 +77,14 @@ def decrement(self): def runner_ray_remote( concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any] ) -> Any: - concurrent_containers = concurrent_container_counter.increment.remote() + concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) print(f"Concurrent container #{concurrent_containers}", flush=True) ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time result = asyncio.run(runner(**params)) - concurrent_container_counter.decrement.remote() + ray.get(concurrent_container_counter.decrement.remote()) return result From a4ed09d9beb451822218a1d2382a793586cc852d Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 09:14:23 -0800 Subject: [PATCH 027/127] fix Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index c273cd3d2..cf79738dc 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -59,12 +59,12 @@ def __init__(self): self.concurrent_containers = 0 def increment(self): - self.counter += 1 - return self.counter + self.concurrent_containers += 1 + return self.concurrent_containers def decrement(self): - self.counter += 1 - return self.counter + self.concurrent_containers -= 1 + return self.concurrent_containers @ray.remote( From f912c54bf0dfb6676805643368843dbe20c8b002 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 12:16:22 -0800 Subject: [PATCH 028/127] print usage Signed-off-by: Brian Yu --- responses_api_models/vllm_model/app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 46319303d..16d13edb1 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -251,6 +251,9 @@ async def chat_completions( else: raise e + # TODO remove + print(chat_completion_dict["usage"]) + choice_dict = chat_completion_dict["choices"][0] if self.config.uses_reasoning_parser: reasoning_content = choice_dict["message"].get("reasoning_content") From 5ffe4487adc95c36a1fdd5ac22822b9a4091bf2d Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 12:18:59 -0800 Subject: [PATCH 029/127] flush Signed-off-by: Brian Yu --- responses_api_models/vllm_model/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 16d13edb1..30bb7a76f 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -252,7 +252,7 @@ async def chat_completions( raise e # TODO remove - print(chat_completion_dict["usage"]) + print(chat_completion_dict["usage"], flush=True) choice_dict = chat_completion_dict["choices"][0] if self.config.uses_reasoning_parser: From f9450a20ab5e745659c870553758e9bd97e90b0c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 12:28:15 -0800 Subject: [PATCH 030/127] try disable Signed-off-by: Brian Yu --- nemo_gym/rollout_collection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_gym/rollout_collection.py b/nemo_gym/rollout_collection.py index 8d2db4556..4c11e81f3 100644 --- a/nemo_gym/rollout_collection.py +++ b/nemo_gym/rollout_collection.py @@ -150,8 +150,9 @@ async def _post_subroutine(row: Dict) -> Tuple[Dict, Dict]: await raise_for_status(res) return row, await get_response_json(res) + # TODO revert disable=True return tqdm.as_completed( - map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples) + map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples), disable=True ) def setup_server_client(self, head_server_config: Optional[BaseServerConfig] = None) -> ServerClient: From 8575e3681e4ed75fd4aafb13eda4e0a233c7cda5 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 12:43:55 -0800 Subject: [PATCH 031/127] revert prefix server logs Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index 58ae1337f..99d1c21b9 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -576,7 +576,8 @@ def run_webserver(cls) -> FastAPI: # pragma: no cover app = server.setup_webserver() server.set_ulimit() - server.prefix_server_logs() + # TODO remove + # server.prefix_server_logs() server.setup_exception_middleware(app) @app.exception_handler(RequestValidationError) From 5f7079d3f9251e0ac100592cd1ff4d1189b4fd68 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:05:26 -0800 Subject: [PATCH 032/127] clean Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index 99d1c21b9..58ae1337f 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -576,8 +576,7 @@ def run_webserver(cls) -> FastAPI: # pragma: no cover app = server.setup_webserver() server.set_ulimit() - # TODO remove - # server.prefix_server_logs() + server.prefix_server_logs() server.setup_exception_middleware(app) @app.exception_handler(RequestValidationError) From 5403fbc5c41b8bdc86f4369f06d98b0735164616 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:05:49 -0800 Subject: [PATCH 033/127] clean Signed-off-by: Brian Yu --- nemo_gym/rollout_collection.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo_gym/rollout_collection.py b/nemo_gym/rollout_collection.py index 4c11e81f3..8d2db4556 100644 --- a/nemo_gym/rollout_collection.py +++ b/nemo_gym/rollout_collection.py @@ -150,9 +150,8 @@ async def _post_subroutine(row: Dict) -> Tuple[Dict, Dict]: await raise_for_status(res) return row, await get_response_json(res) - # TODO revert disable=True return tqdm.as_completed( - map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples), disable=True + map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples) ) def setup_server_client(self, head_server_config: Optional[BaseServerConfig] = None) -> ServerClient: From 8fc2ebd6f9b62eb4e20f6bbd4683d9785584007c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:06:18 -0800 Subject: [PATCH 034/127] clean Signed-off-by: Brian Yu --- responses_api_models/vllm_model/app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 30bb7a76f..46319303d 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -251,9 +251,6 @@ async def chat_completions( else: raise e - # TODO remove - print(chat_completion_dict["usage"], flush=True) - choice_dict = chat_completion_dict["choices"][0] if self.config.uses_reasoning_parser: reasoning_content = choice_dict["message"].get("reasoning_content") From ba6153c4b215b4d3499fdedc12217654cbb5b860 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:10:50 -0800 Subject: [PATCH 035/127] try logger warning Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index cf79738dc..122a31b7c 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -78,7 +78,10 @@ def runner_ray_remote( concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any] ) -> Any: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) - print(f"Concurrent container #{concurrent_containers}", flush=True) + + from logging import getLogger + + getLogger().warning(f"Concurrent container #{concurrent_containers}") ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time From f56c36f196620b8e49476312b785d0c54ac77fc5 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:12:31 -0800 Subject: [PATCH 036/127] print usage again Signed-off-by: Brian Yu --- responses_api_models/vllm_model/app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 46319303d..30bb7a76f 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -251,6 +251,9 @@ async def chat_completions( else: raise e + # TODO remove + print(chat_completion_dict["usage"], flush=True) + choice_dict = chat_completion_dict["choices"][0] if self.config.uses_reasoning_parser: reasoning_content = choice_dict["message"].get("reasoning_content") From 49c7a9074895e6da48fe1acd2b48e7fd91d539ab Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:16:27 -0800 Subject: [PATCH 037/127] try info Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 122a31b7c..cd1da0074 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -81,7 +81,7 @@ def runner_ray_remote( from logging import getLogger - getLogger().warning(f"Concurrent container #{concurrent_containers}") + getLogger().info(f"Concurrent container #{concurrent_containers}") ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time From d1bda2def99fcc0794c4e3fc3f437a6eff6a6ff5 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:25:56 -0800 Subject: [PATCH 038/127] try info into warning Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index cd1da0074..65a3bfc5d 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -81,7 +81,8 @@ def runner_ray_remote( from logging import getLogger - getLogger().info(f"Concurrent container #{concurrent_containers}") + getLogger().info(f"Concurrent container #{concurrent_containers} info") + getLogger().warning(f"Concurrent container #{concurrent_containers} warning") ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time From dbd49ded9e418c9b63167cc3035beca7ec9fb905 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:37:20 -0800 Subject: [PATCH 039/127] try redirect stdout Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 65a3bfc5d..d1e34ca9b 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -409,4 +409,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: if __name__ == "__main__": - SWEBenchWrapper.run_webserver() + import sys + from contextlib import redirect_stdout + + with redirect_stdout(sys.stderr): + SWEBenchWrapper.run_webserver() From 3025ad62929170d552a3891b55054f2e197ed284 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:38:47 -0800 Subject: [PATCH 040/127] redirect inside too Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index d1e34ca9b..cf8c458b7 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -79,9 +79,13 @@ def runner_ray_remote( ) -> Any: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) + import sys + from contextlib import redirect_stdout from logging import getLogger - getLogger().info(f"Concurrent container #{concurrent_containers} info") + with redirect_stdout(sys.stderr): + getLogger().info(f"Concurrent container #{concurrent_containers} info") + getLogger().warning(f"Concurrent container #{concurrent_containers} warning") ray_submit_time = time.time() From e82a6af1692a8394f565433ae9f466aabed547c0 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:40:50 -0800 Subject: [PATCH 041/127] set log level Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index cf8c458b7..2c73d808a 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -79,14 +79,13 @@ def runner_ray_remote( ) -> Any: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) - import sys - from contextlib import redirect_stdout - from logging import getLogger - - with redirect_stdout(sys.stderr): - getLogger().info(f"Concurrent container #{concurrent_containers} info") + from logging import DEBUG, getLogger - getLogger().warning(f"Concurrent container #{concurrent_containers} warning") + # with redirect_stdout(sys.stderr): + logger = getLogger() + logger.setLevel(DEBUG) + logger.info(f"Concurrent container #{concurrent_containers} info") + logger.warning(f"Concurrent container #{concurrent_containers} warning") ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time From ee0af2f64993e2564ae3911d3e30904070fffd33 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:42:09 -0800 Subject: [PATCH 042/127] try redirect Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 2c73d808a..7152bc4a8 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -79,12 +79,14 @@ def runner_ray_remote( ) -> Any: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) + import sys + from contextlib import redirect_stdout from logging import DEBUG, getLogger - # with redirect_stdout(sys.stderr): logger = getLogger() logger.setLevel(DEBUG) - logger.info(f"Concurrent container #{concurrent_containers} info") + with redirect_stdout(sys.stderr): + logger.info(f"Concurrent container #{concurrent_containers} info") logger.warning(f"Concurrent container #{concurrent_containers} warning") ray_submit_time = time.time() From 843b0cb0984c9d7efec48dcd6e0cef1373092f7e Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:46:29 -0800 Subject: [PATCH 043/127] try print with file Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 7152bc4a8..fdc060bb6 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -80,14 +80,11 @@ def runner_ray_remote( concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) import sys - from contextlib import redirect_stdout - from logging import DEBUG, getLogger + from logging import getLogger - logger = getLogger() - logger.setLevel(DEBUG) - with redirect_stdout(sys.stderr): - logger.info(f"Concurrent container #{concurrent_containers} info") - logger.warning(f"Concurrent container #{concurrent_containers} warning") + print(f"Concurrent container #{concurrent_containers} print file sys.stderr", file=sys.stderr) + print(f"Concurrent container #{concurrent_containers} print file default") + getLogger().warning(f"Concurrent container #{concurrent_containers} warning") ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time @@ -414,8 +411,4 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: if __name__ == "__main__": - import sys - from contextlib import redirect_stdout - - with redirect_stdout(sys.stderr): - SWEBenchWrapper.run_webserver() + SWEBenchWrapper.run_webserver() From 7ef8b0791c416d2eaa13acb2e663a2149a418463 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:47:43 -0800 Subject: [PATCH 044/127] try print again Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index fdc060bb6..0a8d99f03 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -80,10 +80,13 @@ def runner_ray_remote( concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) import sys + from contextlib import redirect_stdout from logging import getLogger print(f"Concurrent container #{concurrent_containers} print file sys.stderr", file=sys.stderr) print(f"Concurrent container #{concurrent_containers} print file default") + with redirect_stdout(sys.stderr): + print(f"Concurrent container #{concurrent_containers} print file redirect stdout to stderr") getLogger().warning(f"Concurrent container #{concurrent_containers} warning") ray_submit_time = time.time() From eb9baffcd317ad95d5310a4b1c865fbd20176ce1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 13:56:21 -0800 Subject: [PATCH 045/127] wrap entire call Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 0a8d99f03..cd9443378 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -17,6 +17,7 @@ import time import uuid from asyncio import Semaphore +from contextlib import redirect_stdout from pathlib import Path from typing import Any, Callable, Dict, Optional @@ -79,19 +80,12 @@ def runner_ray_remote( ) -> Any: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) - import sys - from contextlib import redirect_stdout - from logging import getLogger - - print(f"Concurrent container #{concurrent_containers} print file sys.stderr", file=sys.stderr) - print(f"Concurrent container #{concurrent_containers} print file default") - with redirect_stdout(sys.stderr): - print(f"Concurrent container #{concurrent_containers} print file redirect stdout to stderr") - getLogger().warning(f"Concurrent container #{concurrent_containers} warning") - ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time - result = asyncio.run(runner(**params)) + + with redirect_stdout(sys.stderr): + print(f"Concurrent container #{concurrent_containers}") + result = asyncio.run(runner(**params)) ray.get(concurrent_container_counter.decrement.remote()) From 516b66fcb78aae49ebea26c6f18073490d6081ed Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 14:00:25 -0800 Subject: [PATCH 046/127] just use std err Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index cd9443378..8338f628f 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -17,7 +17,6 @@ import time import uuid from asyncio import Semaphore -from contextlib import redirect_stdout from pathlib import Path from typing import Any, Callable, Dict, Optional @@ -79,13 +78,12 @@ def runner_ray_remote( concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any] ) -> Any: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) + print(f"Concurrent container #{concurrent_containers}", file=sys.stderr) ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time - with redirect_stdout(sys.stderr): - print(f"Concurrent container #{concurrent_containers}") - result = asyncio.run(runner(**params)) + result = asyncio.run(runner(**params)) ray.get(concurrent_container_counter.decrement.remote()) From 5a0c1a2953937669b51ff813945d6704c9dd48f1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 16:31:08 -0800 Subject: [PATCH 047/127] try impl dump graph Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 17 +++++++++++++++-- pyproject.toml | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index 58ae1337f..e942cb489 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -48,8 +48,10 @@ from fastapi.exception_handlers import request_validation_exception_handler from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse +from gprof2dot import main as gprof2dot_main from omegaconf import DictConfig, OmegaConf, open_dict from pydantic import BaseModel, ConfigDict +from pydot import graph_from_dot_file from requests.exceptions import ConnectionError from starlette.middleware.sessions import SessionMiddleware @@ -452,14 +454,25 @@ async def exception_handling_middleware(request: Request, call_next): return JSONResponse(content="An unknown error occurred", status_code=500) def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None: # pragma: no cover - base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath - server_profile_path = (base_profile_dir / self.get_session_middleware_key()).with_suffix(".log") + base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath / self.get_session_middleware_key() + server_profile_path = base_profile_dir / "yappi.log" + callgrind_path = base_profile_dir / "yappi.callgrind" + callgrind_dotfile_path = base_profile_dir / "yappi.dot" + callgrind_graph_path = base_profile_dir / "yappi.png" base_profile_dir.mkdir(parents=True, exist_ok=True) main_app_lifespan = app.router.lifespan_context def _dump_yappi_stats() -> str: + yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") + gprof2dot_main( + argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split() + ) + + (graph,) = graph_from_dot_file(callgrind_dotfile_path) + graph.write_png(callgrind_graph_path) + buffer = StringIO() yappi.get_func_stats().print_all( out=buffer, diff --git a/pyproject.toml b/pyproject.toml index 522d4645a..038c338e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -158,6 +158,9 @@ dependencies = [ # Updated: Thu Jan 08, 2026 with orjson==3.11.3 # License: Apache 2.0 https://github.com/ijl/orjson/blob/fb3eb1f729c7e7b019f780af5695722c99c7c695/LICENSE-APACHE "orjson", + + "gprof2dot", + "pydot", ] [dependency-groups] From c029cb6fffa531c911a5c9d4a0b295c5651fba59 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 16:57:44 -0800 Subject: [PATCH 048/127] add print Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index e942cb489..70b577b7d 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -603,6 +603,7 @@ async def validation_exception_handler(request: Request, exc): profiling_config = ProfilingMiddlewareConfig.model_validate(global_config_dict) if profiling_config.profiling_enabled: + print(f"Enabled profiling for {server.config.name}") server.setup_profiling(app, profiling_config) uvicorn_logging_cfg = UvicornLoggingConfig.model_validate(global_config_dict) From d1f72de0005b1aa63b844ccfadd0a4f99c0dcdd6 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 17:16:55 -0800 Subject: [PATCH 049/127] clean Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index 70b577b7d..e942cb489 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -603,7 +603,6 @@ async def validation_exception_handler(request: Request, exc): profiling_config = ProfilingMiddlewareConfig.model_validate(global_config_dict) if profiling_config.profiling_enabled: - print(f"Enabled profiling for {server.config.name}") server.setup_profiling(app, profiling_config) uvicorn_logging_cfg = UvicornLoggingConfig.model_validate(global_config_dict) From cc67ad3418b93a324c45f6efe766d4fb11986901 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 17:19:05 -0800 Subject: [PATCH 050/127] add prints Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index e942cb489..aff9eb36b 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -465,12 +465,17 @@ def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareCon main_app_lifespan = app.router.lifespan_context def _dump_yappi_stats() -> str: + # TODO remove + print("yappi get func stats", file=sys.stderr) yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") + print("gprof2dot_main", file=sys.stderr) gprof2dot_main( argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split() ) + print("graph_from_dot_file", file=sys.stderr) (graph,) = graph_from_dot_file(callgrind_dotfile_path) + print("graph.write_png", file=sys.stderr) graph.write_png(callgrind_graph_path) buffer = StringIO() From b085a140929d3482c6ec3d6eb69b1be2853456bc Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 17:31:11 -0800 Subject: [PATCH 051/127] try timeout and kill Signed-off-by: Brian Yu --- nemo_gym/cli.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nemo_gym/cli.py b/nemo_gym/cli.py index af3558253..ec0f9c8a6 100644 --- a/nemo_gym/cli.py +++ b/nemo_gym/cli.py @@ -25,7 +25,7 @@ from os.path import exists from pathlib import Path from signal import SIGINT -from subprocess import Popen +from subprocess import Popen, TimeoutExpired from threading import Thread from time import sleep, time from typing import Dict, List, Optional, Tuple @@ -343,8 +343,12 @@ def shutdown(self) -> None: process.send_signal(SIGINT) print("Waiting for processes to finish...") - for process in self._processes.values(): - process.wait() + for top_level_path, process in self._processes.items(): + try: + process.wait(timeout=5) + except TimeoutExpired: + print(f"Waiting for process {top_level_path} timed out. Killing process instead.") + process.kill() self._processes = dict() From ea35354c231ad27bc62a6dde2053d4edb0682f31 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 17:35:04 -0800 Subject: [PATCH 052/127] revert Signed-off-by: Brian Yu --- nemo_gym/cli.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/nemo_gym/cli.py b/nemo_gym/cli.py index ec0f9c8a6..af3558253 100644 --- a/nemo_gym/cli.py +++ b/nemo_gym/cli.py @@ -25,7 +25,7 @@ from os.path import exists from pathlib import Path from signal import SIGINT -from subprocess import Popen, TimeoutExpired +from subprocess import Popen from threading import Thread from time import sleep, time from typing import Dict, List, Optional, Tuple @@ -343,12 +343,8 @@ def shutdown(self) -> None: process.send_signal(SIGINT) print("Waiting for processes to finish...") - for top_level_path, process in self._processes.items(): - try: - process.wait(timeout=5) - except TimeoutExpired: - print(f"Waiting for process {top_level_path} timed out. Killing process instead.") - process.kill() + for process in self._processes.values(): + process.wait() self._processes = dict() From 946b8dc5da6c7f6bc71900c6eed9bbbefc684b77 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 17:46:35 -0800 Subject: [PATCH 053/127] clean Signed-off-by: Brian Yu --- nemo_gym/server_utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index aff9eb36b..e942cb489 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -465,17 +465,12 @@ def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareCon main_app_lifespan = app.router.lifespan_context def _dump_yappi_stats() -> str: - # TODO remove - print("yappi get func stats", file=sys.stderr) yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") - print("gprof2dot_main", file=sys.stderr) gprof2dot_main( argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split() ) - print("graph_from_dot_file", file=sys.stderr) (graph,) = graph_from_dot_file(callgrind_dotfile_path) - print("graph.write_png", file=sys.stderr) graph.write_png(callgrind_graph_path) buffer = StringIO() From 1e2dad11e04eb7abcfcaa05f5231f5ca3cbda2ce Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 20:33:48 -0800 Subject: [PATCH 054/127] simplify get global config dict Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index fbaa478f9..bef135502 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -23,6 +23,7 @@ from openai.types.responses.function_tool import FunctionTool +from nemo_gym.global_config import get_global_config_dict from nemo_gym.openai_utils import ( NeMoGymEasyInputMessage, NeMoGymFunctionCallOutput, @@ -33,7 +34,7 @@ NeMoGymResponseOutputMessageForTraining, NeMoGymResponseOutputText, ) -from nemo_gym.server_utils import ServerClient, get_first_server_config_dict +from nemo_gym.server_utils import get_first_server_config_dict from responses_api_agents.swe_agents.run_openhands import ( RunOpenHandsAgent, SupportedAgentFrameworks, @@ -610,7 +611,7 @@ def extract_problem_info( def get_model_endpoint(model_server_name: str) -> str: - global_config_dict = ServerClient.load_from_global_config().global_config_dict + global_config_dict = get_global_config_dict() model_server_config = get_first_server_config_dict( global_config_dict, From 826310f5e1f6a3b5eaf0f6698137acb95dc0016f Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 20:35:16 -0800 Subject: [PATCH 055/127] try fix serialization error Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index bef135502..537824dca 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -399,7 +399,7 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List: parameters=func_def.get("parameters"), strict=func_def.get("strict"), # May be None ) - tools.append(function_tool) + tools.append(function_tool.model_dump()) return tools From f893f096425637133db5861de85cafeab091e0e3 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 20:51:31 -0800 Subject: [PATCH 056/127] try refactor into profiler Signed-off-by: Brian Yu --- nemo_gym/profiling.py | 60 ++++++++++++++++++++++++++++++++++++++++ nemo_gym/server_utils.py | 58 ++++---------------------------------- 2 files changed, 66 insertions(+), 52 deletions(-) create mode 100644 nemo_gym/profiling.py diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py new file mode 100644 index 000000000..91b6cb9b3 --- /dev/null +++ b/nemo_gym/profiling.py @@ -0,0 +1,60 @@ +from io import StringIO +from pathlib import Path + +import yappi +from gprof2dot import main as gprof2dot_main +from pydantic import BaseModel +from pydot import graph_from_dot_file + + +class Profiler(BaseModel): + name: str + base_profile_dir: Path + + def start(self) -> None: + yappi.set_clock_type("CPU") + yappi.start() + print(f"🔍 Enabled profiling for {self.name}") + + def stop(self) -> None: + print(f"🛑 Stopping profiler for {self.name}. Check {self.base_profile_dir} for the metrics!") + yappi.stop() + self.dump() + + def dump(self) -> None: + self.base_profile_dir.mkdir(parents=True, exist_ok=True) + log_path = self.base_profile_dir / "yappi.log" + callgrind_path = self.base_profile_dir / "yappi.callgrind" + callgrind_dotfile_path = self.base_profile_dir / "yappi.dot" + callgrind_graph_path = self.base_profile_dir / "yappi.png" + + yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") + gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()) + + (graph,) = graph_from_dot_file(callgrind_dotfile_path) + graph.write_png(callgrind_graph_path) + + buffer = StringIO() + yappi.get_func_stats().print_all( + out=buffer, + columns={ + 0: ("name", 200), + 1: ("ncall", 10), + 2: ("tsub", 8), + 3: ("ttot", 8), + 4: ("tavg", 8), + }, + ) + + buffer.seek(0) + res = "" + past_header = False + for line in buffer: + if not past_header or self.config.entrypoint in line: + res += line + + if line.startswith("name"): + past_header = True + + with open(log_path, "w") as f: + f.write(res) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index e942cb489..a687376dd 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -19,7 +19,6 @@ import sys from abc import abstractmethod from contextlib import asynccontextmanager -from io import StringIO from logging import Filter as LoggingFilter from logging import LogRecord, getLogger from os import environ, getenv @@ -33,7 +32,6 @@ import ray import requests import uvicorn -import yappi from aiohttp import ( ClientResponse, ClientResponseError, @@ -48,10 +46,8 @@ from fastapi.exception_handlers import request_validation_exception_handler from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse -from gprof2dot import main as gprof2dot_main from omegaconf import DictConfig, OmegaConf, open_dict from pydantic import BaseModel, ConfigDict -from pydot import graph_from_dot_file from requests.exceptions import ConnectionError from starlette.middleware.sessions import SessionMiddleware @@ -69,6 +65,7 @@ get_first_server_config_dict, get_global_config_dict, ) +from nemo_gym.profiling import Profiler _GLOBAL_AIOHTTP_CLIENT: Union[None, ClientSession] = None @@ -455,68 +452,25 @@ async def exception_handling_middleware(request: Request, call_next): def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None: # pragma: no cover base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath / self.get_session_middleware_key() - server_profile_path = base_profile_dir / "yappi.log" - callgrind_path = base_profile_dir / "yappi.callgrind" - callgrind_dotfile_path = base_profile_dir / "yappi.dot" - callgrind_graph_path = base_profile_dir / "yappi.png" - - base_profile_dir.mkdir(parents=True, exist_ok=True) + profiler = Profiler(name=self.config.name, base_profile_dir=base_profile_dir) main_app_lifespan = app.router.lifespan_context - def _dump_yappi_stats() -> str: - yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") - gprof2dot_main( - argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split() - ) - - (graph,) = graph_from_dot_file(callgrind_dotfile_path) - graph.write_png(callgrind_graph_path) - - buffer = StringIO() - yappi.get_func_stats().print_all( - out=buffer, - columns={ - 0: ("name", 200), - 1: ("ncall", 10), - 2: ("tsub", 8), - 3: ("ttot", 8), - 4: ("tavg", 8), - }, - ) - - buffer.seek(0) - res = "" - past_header = False - for line in buffer: - if not past_header or self.config.entrypoint in line: - res += line - - if line.startswith("name"): - past_header = True - - return res - @asynccontextmanager async def lifespan_wrapper(app): - yappi.set_clock_type("CPU") - yappi.start() - print(f"🔍 Enabled profiling for {self.config.name}") + profiler.start() async with main_app_lifespan(app) as maybe_state: yield maybe_state - print(f"🛑 Stopping profiler for {self.config.name}. Check {server_profile_path} for the metrics!") - yappi.stop() - - with open(server_profile_path, "w") as f: - f.write(_dump_yappi_stats()) + profiler.stop() app.router.lifespan_context = lifespan_wrapper @app.get("/stats") def stats(): - return Response(_dump_yappi_stats()) + profiler.dump() + return Response() def set_ulimit(self, target_soft_limit: int = 65535): # pragma: no cover # From https://github.com/vllm-project/vllm/blob/fed8a9b107df3e27d57728c6911c7d308b871477/vllm/utils/__init__.py#L2790 From e504716855a55762dda6a747012adced5b16a725 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 20:58:17 -0800 Subject: [PATCH 057/127] fix Signed-off-by: Brian Yu --- nemo_gym/profiling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py index 91b6cb9b3..319e0d11a 100644 --- a/nemo_gym/profiling.py +++ b/nemo_gym/profiling.py @@ -1,5 +1,6 @@ from io import StringIO from pathlib import Path +from typing import Optional import yappi from gprof2dot import main as gprof2dot_main @@ -11,6 +12,9 @@ class Profiler(BaseModel): name: str base_profile_dir: Path + # Used to clean up and filter out unnecessary information in the yappi log + required_str: Optional[str] = None + def start(self) -> None: yappi.set_clock_type("CPU") yappi.start() @@ -50,7 +54,7 @@ def dump(self) -> None: res = "" past_header = False for line in buffer: - if not past_header or self.config.entrypoint in line: + if not past_header or (self.required_str and self.required_str in line): res += line if line.startswith("name"): From b7f27a9da187acfb7a732a6facb2d8984ea3f625 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Thu, 22 Jan 2026 21:27:33 -0800 Subject: [PATCH 058/127] try add profiling to instance Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 17 ++++++++++++++--- responses_api_agents/swe_agents/utils.py | 7 +------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 8338f628f..ce2eb61a2 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -13,6 +13,7 @@ # limitations under the License. import asyncio import json +import os import sys import time import uuid @@ -40,6 +41,7 @@ NeMoGymResponseOutputMessage, NeMoGymResponseOutputText, ) +from nemo_gym.profiling import Profiler from responses_api_agents.swe_agents.utils import ( convert_tools_to_function_format, convert_trajectory_to_output_items, @@ -83,8 +85,14 @@ def runner_ray_remote( ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time + instance_id = params["problem_info"].get("instance_id", "unknown") + profiler = Profiler(name=instance_id, base_profile_dir=params["persistent_dir"] / "profiling") + profiler.start() + result = asyncio.run(runner(**params)) + profiler.stop() + ray.get(concurrent_container_counter.decrement.remote()) return result @@ -214,33 +222,36 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() # Get model endpoint model_endpoint = get_model_endpoint(self.config.model_server.name) - # Run SWE-bench evaluation + # Create persistent directory for I/O and logs in local workspace instance_dir = ( f"{problem_info.get('instance_id', 'unknown')}_{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}" ) + workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) + persistent_dir = workspace_root / f"swebench_results_{self.config.run_session_id}" / instance_dir + persistent_dir.mkdir(parents=True, exist_ok=True) try: ray_queue_time = time.time() params = { "problem_info": problem_info, "model_endpoint": model_endpoint, "body": body, - "run_session_id": self.config.run_session_id, "agent_framework": self.config.agent_framework, "agent_config": self.config.agent_config, "agent_tools_file": self.config.agent_tools_file, "agent_max_turns": self.config.agent_max_turns, "swebench_tests_timeout": self.config.swebench_tests_timeout, "swebench_agent_timeout": self.config.swebench_agent_timeout, + "persistent_dir": persistent_dir, "agent_framework_repo": self.config.agent_framework_repo, "agent_framework_commit": self.config.agent_framework_commit, "openhands_setup_dir": self.config.openhands_setup_dir, "swebench_setup_dir": self.config.swebench_setup_dir, "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir, "dataset_path": self.config.dataset_path, - "instance_dir": instance_dir, "ray_queue_time": ray_queue_time, } + # Run SWE-bench evaluation future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params) result = await future diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 537824dca..42a24cda9 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -626,28 +626,23 @@ async def run_swebench_evaluation( problem_info: Dict, model_endpoint: str, body: NeMoGymResponseCreateParamsNonStreaming, - run_session_id: str, agent_framework: str, agent_config: Optional[str], agent_tools_file: Optional[str], agent_max_turns: int, swebench_tests_timeout: int, swebench_agent_timeout: int, + persistent_dir: str, agent_framework_repo: Optional[str] = None, agent_framework_commit: str = "HEAD", openhands_setup_dir: Optional[Path] = None, swebench_setup_dir: Optional[Path] = None, r2e_gym_setup_dir: Optional[Path] = None, dataset_path: Optional[str] = None, - instance_dir: Optional[str] = None, ray_queue_time: Optional[float] = None, ray_submit_time: Optional[float] = None, ) -> Dict: - # Create persistent directory for I/O and logs in local workspace - workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) instance_id = problem_info.get("instance_id", "unknown") - persistent_dir = workspace_root / f"swebench_results_{run_session_id}" / instance_dir - persistent_dir.mkdir(parents=True, exist_ok=True) output_file = persistent_dir / "output.jsonl" inference_params = {} From 744567f5140cb40f46249d5e04fd7df86844ab64 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 10:33:57 -0800 Subject: [PATCH 059/127] pip ng profiling dir Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 1eb4e3313..bee5b7933 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -181,6 +181,7 @@ async def _run_openhands( data_point: dict[str, Any], api_base: str, agent_run_id: str, + profiling_dir: str, dataset_mount_path: Optional[str] = None, ): """ @@ -262,6 +263,7 @@ async def _run_openhands( "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) "export POETRY_VIRTUALENVS_IN_PROJECT=true && " + f"export NG_PROFILING_DIR={profiling_dir} && " "export POETRY_VIRTUALENVS_CREATE=false && " "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && " # TODO (sugam): fix cryptography issue From 1cbf06335299cfa9e1a840d58b6f4b724d134732 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 10:39:51 -0800 Subject: [PATCH 060/127] pipe Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 3 ++- responses_api_agents/swe_agents/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index bee5b7933..4c8eb6256 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -911,7 +911,7 @@ def check_tests_passed( return required_tests <= passed_tests - async def process_single_datapoint(self, data_point: dict[str, Any]): + async def process_single_datapoint(self, data_point: dict[str, Any], persistent_dir: Path): self.output_dir = Path(self.cfg.output_file).parent agent_run_id = f"{data_point['instance_id']}_{int(time.time())}_{str(uuid.uuid4())[:8]}" @@ -935,6 +935,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]): api_base, agent_run_id, instance_dataset_path, + persistent_dir / "profiling", ) else: raise ValueError( diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 42a24cda9..cc6806791 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -632,7 +632,7 @@ async def run_swebench_evaluation( agent_max_turns: int, swebench_tests_timeout: int, swebench_agent_timeout: int, - persistent_dir: str, + persistent_dir: Path, agent_framework_repo: Optional[str] = None, agent_framework_commit: str = "HEAD", openhands_setup_dir: Optional[Path] = None, @@ -683,7 +683,7 @@ async def run_swebench_evaluation( dataset_path=dataset_path, ) - result = await run_oh.process_single_datapoint(problem_info) + result = await run_oh.process_single_datapoint(problem_info, persistent_dir) print(f"Process completed for {instance_id}", flush=True) result["oh_time_metrics"]["ray_time_in_queue"] = ray_submit_time - ray_queue_time From 6712ab2dc3dba2e438b2993c21694de36e476fea Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 10:43:25 -0800 Subject: [PATCH 061/127] use name Signed-off-by: Brian Yu --- nemo_gym/profiling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py index 319e0d11a..dcdc61e51 100644 --- a/nemo_gym/profiling.py +++ b/nemo_gym/profiling.py @@ -27,10 +27,10 @@ def stop(self) -> None: def dump(self) -> None: self.base_profile_dir.mkdir(parents=True, exist_ok=True) - log_path = self.base_profile_dir / "yappi.log" - callgrind_path = self.base_profile_dir / "yappi.callgrind" - callgrind_dotfile_path = self.base_profile_dir / "yappi.dot" - callgrind_graph_path = self.base_profile_dir / "yappi.png" + log_path = self.base_profile_dir / f"{self.name}.log" + callgrind_path = self.base_profile_dir / f"{self.name}.callgrind" + callgrind_dotfile_path = self.base_profile_dir / f"{self.name}.dot" + callgrind_graph_path = self.base_profile_dir / f"{self.name}.png" yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()) From 17a2ce94596b6c2910dc47879ba08a79f9601bf1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 10:48:45 -0800 Subject: [PATCH 062/127] try switch commits Signed-off-by: Brian Yu --- .../swe_agents/configs/swebench_openhands.yaml | 2 +- .../swe_agents/configs/swebench_openhands_training.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml index eb4a57583..1e9680950 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml @@ -9,7 +9,7 @@ swe_agents: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 + agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce # Container configuration container_formatter: ??? diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml index d39fac5e1..17898aa51 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml @@ -8,7 +8,7 @@ swe_agents_train: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 + agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce # Container configuration container_formatter: ??? container_folder_path: null @@ -39,7 +39,7 @@ swe_agents_val: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 + agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce # Container configuration container_formatter: ??? container_folder_path: null From 24516f1097a311e51a4bfe31f06630987355e15c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 11:16:48 -0800 Subject: [PATCH 063/127] bump openhands Signed-off-by: Brian Yu --- .../swe_agents/configs/swebench_openhands.yaml | 2 +- .../swe_agents/configs/swebench_openhands_training.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml index 1e9680950..3aa72d780 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml @@ -9,7 +9,7 @@ swe_agents: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce + agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151 # Container configuration container_formatter: ??? diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml index 17898aa51..b30304795 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml @@ -8,7 +8,7 @@ swe_agents_train: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce + agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151 # Container configuration container_formatter: ??? container_folder_path: null @@ -39,7 +39,7 @@ swe_agents_val: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce + agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151 # Container configuration container_formatter: ??? container_folder_path: null From d46d4404e1e4c9b4b1459f1ffb16912d3248d739 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 11:37:31 -0800 Subject: [PATCH 064/127] convert to list Signed-off-by: Brian Yu --- .../swe_agents/run_openhands.py | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 4c8eb6256..ebc0eea82 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -521,30 +521,20 @@ async def _execute_container_command( mount_args.append(f"--mount type=bind,src={venv_path},dst=/openhands_setup/OpenHands/.venv,ro") mount_args.append(f"--mount type=bind,src={venv_path},dst={venv_path},ro") - # make everything in OpenHands read-only - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands,dst=/openhands_setup/OpenHands,ro" + mount_args.extend( + [ + # make everything in OpenHands read-only + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands,dst=/openhands_setup/OpenHands,ro", + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst=/openhands_setup/OpenHands/.eval_sessions", + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst={self.openhands_setup_dir}/OpenHands/.eval_sessions", + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst=/openhands_setup/OpenHands/logs", + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst={self.openhands_setup_dir}/OpenHands/logs", + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst=/openhands_setup/OpenHands/evaluation/oh", + f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst={self.openhands_setup_dir}/OpenHands/evaluation/oh", + # Data + f"--mount type=bind,src={dataset_path_to_mount},dst=/root/dataset/data.jsonl", + ] ) - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst=/openhands_setup/OpenHands/.eval_sessions" - ) - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst={self.openhands_setup_dir}/OpenHands/.eval_sessions" - ) - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst=/openhands_setup/OpenHands/logs" - ) - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst={self.openhands_setup_dir}/OpenHands/logs" - ) - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst=/openhands_setup/OpenHands/evaluation/oh" - ) - mount_args.append( - f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst={self.openhands_setup_dir}/OpenHands/evaluation/oh" - ) - - mount_args.append(f"--mount type=bind,src={dataset_path_to_mount},dst=/root/dataset/data.jsonl") miniforge3_path = Path(self.openhands_setup_dir) / "miniforge3" mount_args.append(f"--mount type=bind,src={miniforge3_path},dst=/openhands_setup/miniforge3,ro") From 53c009c7d26608062a3852037a1169ba5d3630cc Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 11:44:29 -0800 Subject: [PATCH 065/127] try mount profiling dir Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index ebc0eea82..fe3495548 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -337,6 +337,7 @@ async def _run_openhands( max_retries=1, timeout=self.cfg.swebench_agent_timeout + 60, dataset_mount_path=dataset_mount_path, + profiling_dir=profiling_dir, ) with open(out_file, "r") as f: @@ -480,6 +481,7 @@ async def _execute_container_command( max_retries: int = 2, timeout: int = 45 * 60, # 45 minutes dataset_mount_path: Optional[str] = None, + profiling_dir: Optional[str] = None, ): """Execute a command in an Apptainer container with retry logic.""" # Find the container using multiple strategies @@ -536,6 +538,11 @@ async def _execute_container_command( ] ) + if profiling_dir: + mount_args.append( + f"--mount type=bind,src={profiling_dir},dst={profiling_dir}", + ) + miniforge3_path = Path(self.openhands_setup_dir) / "miniforge3" mount_args.append(f"--mount type=bind,src={miniforge3_path},dst=/openhands_setup/miniforge3,ro") mount_args.append(f"--mount type=bind,src={miniforge3_path},dst={miniforge3_path},ro") From 9e9185ed5d701a1e03354aed9de0ef900fffa1f7 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 12:15:56 -0800 Subject: [PATCH 066/127] print mount args Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index fe3495548..dd0f1d4e0 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -590,6 +590,11 @@ async def _execute_container_command( container_commands.append(command) combined_command = " && ".join(container_commands) + # TODO remove + import sys + + print("\n".join(mount_args), file=sys.stderr) + mount_str = " ".join(mount_args) # Launch Apptainer container and execute the command From af13f0443e1e6a6ea8f7455fd0085660603255b9 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 12:19:33 -0800 Subject: [PATCH 067/127] try reuse trajectories dir Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index dd0f1d4e0..8dd46adc8 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -181,7 +181,6 @@ async def _run_openhands( data_point: dict[str, Any], api_base: str, agent_run_id: str, - profiling_dir: str, dataset_mount_path: Optional[str] = None, ): """ @@ -263,7 +262,7 @@ async def _run_openhands( "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) "export POETRY_VIRTUALENVS_IN_PROJECT=true && " - f"export NG_PROFILING_DIR={profiling_dir} && " + f"export NG_PROFILING_DIR=/trajectories_mount/profiling && " "export POETRY_VIRTUALENVS_CREATE=false && " "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && " # TODO (sugam): fix cryptography issue @@ -337,7 +336,6 @@ async def _run_openhands( max_retries=1, timeout=self.cfg.swebench_agent_timeout + 60, dataset_mount_path=dataset_mount_path, - profiling_dir=profiling_dir, ) with open(out_file, "r") as f: @@ -481,7 +479,6 @@ async def _execute_container_command( max_retries: int = 2, timeout: int = 45 * 60, # 45 minutes dataset_mount_path: Optional[str] = None, - profiling_dir: Optional[str] = None, ): """Execute a command in an Apptainer container with retry logic.""" # Find the container using multiple strategies @@ -538,11 +535,6 @@ async def _execute_container_command( ] ) - if profiling_dir: - mount_args.append( - f"--mount type=bind,src={profiling_dir},dst={profiling_dir}", - ) - miniforge3_path = Path(self.openhands_setup_dir) / "miniforge3" mount_args.append(f"--mount type=bind,src={miniforge3_path},dst=/openhands_setup/miniforge3,ro") mount_args.append(f"--mount type=bind,src={miniforge3_path},dst={miniforge3_path},ro") @@ -937,7 +929,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any], persistent_ api_base, agent_run_id, instance_dataset_path, - persistent_dir / "profiling", ) else: raise ValueError( From 13f99cc76eccfc05f75831c250c7c4d2f6c17ae2 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 12:22:06 -0800 Subject: [PATCH 068/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 8dd46adc8..29b5de831 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -582,11 +582,6 @@ async def _execute_container_command( container_commands.append(command) combined_command = " && ".join(container_commands) - # TODO remove - import sys - - print("\n".join(mount_args), file=sys.stderr) - mount_str = " ".join(mount_args) # Launch Apptainer container and execute the command From b046da5c57531183d84464c645ff9dc5cff580c9 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 13:33:57 -0800 Subject: [PATCH 069/127] actually print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index cc6806791..dbce75f71 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -17,6 +17,7 @@ import os import shutil import subprocess +import sys from contextlib import contextmanager from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -774,7 +775,7 @@ def _run_setup_shell_script( raise RuntimeError("Failed to capture script output") for line in process.stdout: - print(line, end="", flush=True) + print(line, end="", file=sys.stderr) output_lines.append(line) process.wait(timeout=timeout_seconds) From 94ff7db44c2c2b5594649be2e3399d37980c9133 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 13:45:13 -0800 Subject: [PATCH 070/127] bump openhands Signed-off-by: Brian Yu --- .../swe_agents/configs/swebench_openhands.yaml | 2 +- .../swe_agents/configs/swebench_openhands_training.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml index 3aa72d780..5234a1b22 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml @@ -9,7 +9,7 @@ swe_agents: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151 + agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01 # Container configuration container_formatter: ??? diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml index b30304795..e74bf228f 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml @@ -8,7 +8,7 @@ swe_agents_train: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151 + agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01 # Container configuration container_formatter: ??? container_folder_path: null @@ -39,7 +39,7 @@ swe_agents_val: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151 + agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01 # Container configuration container_formatter: ??? container_folder_path: null From ee61b6c7e40be162fbd314f0882ee6c4d385f94e Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 14:02:56 -0800 Subject: [PATCH 071/127] try dict functool to avoid serialization error Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index dbce75f71..417a115fe 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -22,8 +22,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -from openai.types.responses.function_tool import FunctionTool - from nemo_gym.global_config import get_global_config_dict from nemo_gym.openai_utils import ( NeMoGymEasyInputMessage, @@ -392,15 +390,15 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List: # Convert to Response FunctionTool format which is flat if tool.get("type") == "function" and "function" in tool: func_def = tool["function"] - # Create FunctionTool object with flat structure - function_tool = FunctionTool( - type="function", - name=func_def.get("name", ""), - description=func_def.get("description"), - parameters=func_def.get("parameters"), - strict=func_def.get("strict"), # May be None + tools.append( + dict( + type="function", + name=func_def.get("name", ""), + description=func_def.get("description"), + parameters=func_def.get("parameters"), + strict=func_def.get("strict"), # May be None + ) ) - tools.append(function_tool.model_dump()) return tools From c345ccc9a534b8cb8804983d570bf22a19489b9f Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 14:41:36 -0800 Subject: [PATCH 072/127] clean Signed-off-by: Brian Yu --- responses_api_models/vllm_model/app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 30bb7a76f..46319303d 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -251,9 +251,6 @@ async def chat_completions( else: raise e - # TODO remove - print(chat_completion_dict["usage"], flush=True) - choice_dict = chat_completion_dict["choices"][0] if self.config.uses_reasoning_parser: reasoning_content = choice_dict["message"].get("reasoning_content") From 6606ae4ffb6017545a5294e3f56e006db4fcd298 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 14:44:26 -0800 Subject: [PATCH 073/127] print tool Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 417a115fe..8d46132e3 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -390,6 +390,8 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List: # Convert to Response FunctionTool format which is flat if tool.get("type") == "function" and "function" in tool: func_def = tool["function"] + # TODO remove + print(func_def, file=sys.stderr) tools.append( dict( type="function", From 8a78997064baa059e8694f76356477e2e3139bee Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 14:57:20 -0800 Subject: [PATCH 074/127] just fail Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 8d46132e3..06d244fc7 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -392,6 +392,7 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List: func_def = tool["function"] # TODO remove print(func_def, file=sys.stderr) + 1 / 0 tools.append( dict( type="function", From c1cb16b0bee3f36486dae014f723ef1dc69bcfcc Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 15:02:04 -0800 Subject: [PATCH 075/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 06d244fc7..417a115fe 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -390,9 +390,6 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List: # Convert to Response FunctionTool format which is flat if tool.get("type") == "function" and "function" in tool: func_def = tool["function"] - # TODO remove - print(func_def, file=sys.stderr) - 1 / 0 tools.append( dict( type="function", From 6d7172e5400ec904b1974b49f5067427997ce3ce Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 15:05:37 -0800 Subject: [PATCH 076/127] try error on warnings Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index ce2eb61a2..26dc2da48 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -17,6 +17,9 @@ import sys import time import uuid + +# TODO remove if doesn't work +import warnings from asyncio import Semaphore from pathlib import Path from typing import Any, Callable, Dict, Optional @@ -55,6 +58,10 @@ ) +# Set all warnings to be treated as errors +warnings.filterwarnings("error") + + @ray.remote class ConcurrentContainerCounter: def __init__(self): From 22342acff4d9586254874291289cd92159898b67 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 15:10:31 -0800 Subject: [PATCH 077/127] print metadata Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 26dc2da48..baa388657 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -314,6 +314,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() if "swe-bench-metrics" in result: metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) + # TODO remove + print(metadata, file=sys.stderr) + return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", created_at=int(time.time()), From 1b74d0ee5aacb2583d95895df56ad98d04f2cbb1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 15:16:00 -0800 Subject: [PATCH 078/127] print many newlines Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index baa388657..10a6feffa 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -315,7 +315,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) # TODO remove - print(metadata, file=sys.stderr) + print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr) return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", From 37312c1b9cdb107cdc90220e591dc7db0e9e8a14 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 15:19:57 -0800 Subject: [PATCH 079/127] dump Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 10a6feffa..9dd26773c 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -316,6 +316,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() # TODO remove print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr) + with open("temp.json", "w") as f: + json.dump(metadata, f) return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", From f600cdb2783a0029c8df7762db31b447579a50e4 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Fri, 23 Jan 2026 15:51:48 -0800 Subject: [PATCH 080/127] feat: update oh w/ mem limt and cmd timeout Signed-off-by: Sugam Devare --- responses_api_agents/swe_agents/app.py | 8 ++++++ .../configs/swebench_openhands.yaml | 10 ++++--- .../configs/swebench_openhands_training.yaml | 16 +++++++----- .../swe_agents/run_openhands.py | 26 ++++++++++++++++--- responses_api_agents/swe_agents/utils.py | 4 +++ 5 files changed, 50 insertions(+), 14 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 1c0c9b046..3844f8330 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -94,6 +94,12 @@ class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig): swebench_agent_timeout: int = Field(default=45 * 60, description="Timeout for running the agent (seconds)") + apptainer_memory_limit_mb: int = Field( + default=32 * 1024, description="Memory limit for the apptainer container (MB)" + ) + + command_exec_timeout: int = Field(default=5 * 60, description="Timeout for executing the command (seconds)") + # Concurrency control concurrency: int = Field(default=256, description="Maximum number of concurrent SWE-bench runs") @@ -212,6 +218,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "dataset_path": self.config.dataset_path, "instance_dir": instance_dir, "ray_queue_time": ray_queue_time, + "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb, + "command_exec_timeout": self.config.command_exec_timeout, } future = runner_ray_remote.remote(run_swebench_evaluation, params) diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml index eb4a57583..a7260856e 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml @@ -9,14 +9,16 @@ swe_agents: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 + agent_framework_commit: dfd04f41c9af452a9a230c7378699c6119bcb2db # Container configuration container_formatter: ??? container_folder_path: null - swebench_agent_timeout: 2700 # 45 minutes - swebench_tests_timeout: 1800 - + swebench_agent_timeout: 1800 + swebench_tests_timeout: 900 + apptainer_memory_limit_mb: 32768 + command_exec_timeout: 300 + dataset_path: ??? # Optional model server reference diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml index d39fac5e1..bd7f8abb4 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml @@ -8,12 +8,14 @@ swe_agents_train: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 + agent_framework_commit: dfd04f41c9af452a9a230c7378699c6119bcb2db # Container configuration container_formatter: ??? container_folder_path: null - swebench_agent_timeout: 2700 # 45 minutes - swebench_tests_timeout: 900 # 15 minutes + swebench_agent_timeout: 1800 + swebench_tests_timeout: 900 + apptainer_memory_limit_mb: 32768 + command_exec_timeout: 300 dataset_path: ??? model_server: name: policy_model # openai_model @@ -39,12 +41,14 @@ swe_agents_val: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475 + agent_framework_commit: dfd04f41c9af452a9a230c7378699c6119bcb2db # Container configuration container_formatter: ??? container_folder_path: null - swebench_agent_timeout: 2700 # 45 minutes - swebench_tests_timeout: 1800 # 30 minutes + swebench_agent_timeout: 1800 + swebench_tests_timeout: 900 + apptainer_memory_limit_mb: 32768 + command_exec_timeout: 300 dataset_path: ??? # Optional model server reference model_server: diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 1eb4e3313..7fdaba4af 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -62,6 +62,8 @@ class SweBenchGenerationConfig: agent_max_turns: int = 100 swebench_tests_timeout: int = 30 * 60 swebench_agent_timeout: int = 45 * 60 + apptainer_memory_limit_mb: int = 32 * 1024 + command_exec_timeout: int = 5 * 60 inference: SweBenchInferenceConfig = field(default_factory=SweBenchInferenceConfig) server: dict = field(default_factory=dict) @@ -224,8 +226,10 @@ async def _run_openhands( agent_script_name = f"agent_script_{agent_run_id}.sh" cleanup_commands = ( f"cd /openhands_setup/OpenHands && " - f"mkdir -p /trajectories_mount/trajectories && " - f"cp -r {eval_dir_in_openhands}/*/*/* /trajectories_mount/trajectories/{data_point['instance_id']}/ &&" + f"mkdir -p /trajectories_mount/trajectories/{data_point['instance_id']}/llm_completions/{data_point['instance_id']}/ && " + f"cp {eval_dir_in_openhands}/*/*/*/output.jsonl /trajectories_mount/trajectories/{data_point['instance_id']}/ && " + f"latest=$(ls -t {eval_dir_in_openhands}/*/*/*/llm_completions/*/*.json 2>/dev/null | head -1); " + f'[ -n "$latest" ] && cp "$latest" /trajectories_mount/trajectories/{data_point["instance_id"]}/llm_completions/{data_point["instance_id"]}/ && ' f"rm -rf {eval_dir_in_openhands} && rm -rf {config_file_path}" ) @@ -264,6 +268,8 @@ async def _run_openhands( "export POETRY_VIRTUALENVS_IN_PROJECT=true && " "export POETRY_VIRTUALENVS_CREATE=false && " "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && " + f"export TMUX_MEMORY_LIMIT={self.cfg.apptainer_memory_limit_mb} && " + f"export COMMAND_EXEC_TIMEOUT={self.cfg.command_exec_timeout} && " # TODO (sugam): fix cryptography issue # "override_dir=$(mktemp -d /tmp/cryptography_override.XXXX) && " # # Reinstall cryptography inside the container (via poetry's venv) using a compatible wheel @@ -358,7 +364,7 @@ async def _run_openhands( ) ) except Exception as e: - print(f"oh run_infer.sh output parsing failed: {e}", flush=True) + print(f"Running OpenHands failed: {e}", flush=True) return None return pred_file @@ -595,10 +601,14 @@ async def _execute_container_command( # Launch Apptainer container and execute the command apptainer_cmd = ( - f"apptainer exec --writable-tmpfs --cleanenv --no-mount home,tmp,bind-paths " + f"apptainer exec --writable-tmpfs --cleanenv --pid --no-mount home,tmp,bind-paths " f"{mount_str} " f" {container_name} bash -c {shlex.quote(combined_command)}" ) + memory_limit_mb = self.cfg.apptainer_memory_limit_mb + if memory_limit_mb is not None and memory_limit_mb > 0: + memory_limit_kb = int(memory_limit_mb) * 1024 + apptainer_cmd = f"ulimit -v {memory_limit_kb} && {apptainer_cmd}" # Retry apptainer command up to max_retries times for attempt in range(max_retries): @@ -633,6 +643,14 @@ async def _execute_container_command( if len(pred_files) == 1: return pred_files[0] + elif len(pred_files) > 1: + latest_file = max(pred_files, key=os.path.getmtime) + print( + f"Multiple outputs found for {data_point['instance_id']} " + f"({len(pred_files)}). Using latest: {latest_file}", + flush=True, + ) + return latest_file else: raise ValueError( f"Expected exactly one file matching {expected_file_pattern} for {data_point['instance_id']}, " diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index e60df2ea9..c0572bb59 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -641,6 +641,8 @@ async def run_swebench_evaluation( instance_dir: Optional[str] = None, ray_queue_time: Optional[float] = None, ray_submit_time: Optional[float] = None, + apptainer_memory_limit_mb: Optional[int] = None, + command_exec_timeout: Optional[int] = None, ) -> Dict: # Create persistent directory for I/O and logs in local workspace workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) @@ -675,6 +677,8 @@ async def run_swebench_evaluation( agent_max_turns=agent_max_turns, swebench_tests_timeout=swebench_tests_timeout, swebench_agent_timeout=swebench_agent_timeout, + apptainer_memory_limit_mb=apptainer_memory_limit_mb, + command_exec_timeout=command_exec_timeout, inference=inference_config, server=server, ) From 591b409975d40466f32445ab8e7b71bb1fa7193e Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 15:54:45 -0800 Subject: [PATCH 081/127] error and prints Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 9dd26773c..fb3dfa44a 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -262,6 +262,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params) result = await future + # TODO remove + print("HIT 1", file=sys.stderr) + # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) @@ -270,6 +273,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() raw_tools = result.get("tools", []) tools = convert_tools_to_function_format(raw_tools) if raw_tools else [] + # TODO remove + print("HIT 2", file=sys.stderr) + # Convert trajectory to NeMoGym output items if trajectory: output_items = convert_trajectory_to_output_items( @@ -277,6 +283,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() self.config.agent_framework, ) + # TODO remove + print("HIT 3", file=sys.stderr) + # If no trajectory or empty output, create a summary message if not output_items: output_items = [ @@ -297,6 +306,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() ) ] + # TODO remove + print("HIT 4", file=sys.stderr) + # Store the full result in metadata for the verify step # Note: metadata values must be strings for NeMoGymResponse metadata = { @@ -310,12 +322,16 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() if key in result: metadata[key] = str(result[key]) + # TODO remove + print("HIT 5", file=sys.stderr) + # For complex metrics, store as JSON string if "swe-bench-metrics" in result: metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) # TODO remove print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr) + 1 / 0 with open("temp.json", "w") as f: json.dump(metadata, f) From 9deffae825e3033b3b2930b2e9f68a3e7a04cf3c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:01:26 -0800 Subject: [PATCH 082/127] dont error on warnings Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index fb3dfa44a..fb1df1bc8 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -19,7 +19,6 @@ import uuid # TODO remove if doesn't work -import warnings from asyncio import Semaphore from pathlib import Path from typing import Any, Callable, Dict, Optional @@ -58,10 +57,6 @@ ) -# Set all warnings to be treated as errors -warnings.filterwarnings("error") - - @ray.remote class ConcurrentContainerCounter: def __init__(self): From 83b585171d27bf2be59c3e33756b8890b45137e2 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:12:35 -0800 Subject: [PATCH 083/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index fb1df1bc8..ce2eb61a2 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -17,8 +17,6 @@ import sys import time import uuid - -# TODO remove if doesn't work from asyncio import Semaphore from pathlib import Path from typing import Any, Callable, Dict, Optional @@ -257,9 +255,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params) result = await future - # TODO remove - print("HIT 1", file=sys.stderr) - # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) @@ -268,9 +263,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() raw_tools = result.get("tools", []) tools = convert_tools_to_function_format(raw_tools) if raw_tools else [] - # TODO remove - print("HIT 2", file=sys.stderr) - # Convert trajectory to NeMoGym output items if trajectory: output_items = convert_trajectory_to_output_items( @@ -278,9 +270,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() self.config.agent_framework, ) - # TODO remove - print("HIT 3", file=sys.stderr) - # If no trajectory or empty output, create a summary message if not output_items: output_items = [ @@ -301,9 +290,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() ) ] - # TODO remove - print("HIT 4", file=sys.stderr) - # Store the full result in metadata for the verify step # Note: metadata values must be strings for NeMoGymResponse metadata = { @@ -317,19 +303,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() if key in result: metadata[key] = str(result[key]) - # TODO remove - print("HIT 5", file=sys.stderr) - # For complex metrics, store as JSON string if "swe-bench-metrics" in result: metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) - # TODO remove - print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr) - 1 / 0 - with open("temp.json", "w") as f: - json.dump(metadata, f) - return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", created_at=int(time.time()), From 879c623871a9ceb1702c9d4a183de5703d7019be Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:14:15 -0800 Subject: [PATCH 084/127] ignore pydantic serialization warnings Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index ce2eb61a2..e337b991e 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -17,12 +17,14 @@ import sys import time import uuid +import warnings from asyncio import Semaphore from pathlib import Path from typing import Any, Callable, Dict, Optional import ray from pydantic import ConfigDict, Field +from pydantic_core import PydanticSerializationUnexpectedValue from nemo_gym.base_resources_server import ( BaseRunRequest, @@ -55,6 +57,9 @@ ) +warnings.filterwarnings("ignore", category=PydanticSerializationUnexpectedValue) + + @ray.remote class ConcurrentContainerCounter: def __init__(self): From 01f7a8dc67c77035a45ffd3fa9562124c38f717c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:18:42 -0800 Subject: [PATCH 085/127] try filter by message Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index e337b991e..0f37d2681 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -24,7 +24,6 @@ import ray from pydantic import ConfigDict, Field -from pydantic_core import PydanticSerializationUnexpectedValue from nemo_gym.base_resources_server import ( BaseRunRequest, @@ -57,7 +56,7 @@ ) -warnings.filterwarnings("ignore", category=PydanticSerializationUnexpectedValue) +warnings.filterwarnings("ignore", message="FunctionTool") @ray.remote From b29b432391f2d953261a0802839602e7743a9c29 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:22:39 -0800 Subject: [PATCH 086/127] revert back to function tool Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 417a115fe..5ef2d72d5 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -22,6 +22,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +from openai.types.responses.function_tool import FunctionTool + from nemo_gym.global_config import get_global_config_dict from nemo_gym.openai_utils import ( NeMoGymEasyInputMessage, @@ -390,15 +392,15 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List: # Convert to Response FunctionTool format which is flat if tool.get("type") == "function" and "function" in tool: func_def = tool["function"] - tools.append( - dict( - type="function", - name=func_def.get("name", ""), - description=func_def.get("description"), - parameters=func_def.get("parameters"), - strict=func_def.get("strict"), # May be None - ) + # Create FunctionTool object with flat structure + function_tool = FunctionTool( + type="function", + name=func_def.get("name", ""), + description=func_def.get("description"), + parameters=func_def.get("parameters"), + strict=func_def.get("strict"), # May be None ) + tools.append(function_tool) return tools From 78dc107a75bfbcc0139f81b102afc0d912e3cce8 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:24:03 -0800 Subject: [PATCH 087/127] add comment Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 0f37d2681..a65aa9cb8 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -56,6 +56,8 @@ ) +# There are some mysterious Pydantic serialization warnings related to FunctionTool that are not fatal that clutter up logs. +# At some point we can try continue chasing this one down. warnings.filterwarnings("ignore", message="FunctionTool") From 028a9d7cb60b4aabd8ad4e216a0c9a051c304e69 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:25:12 -0800 Subject: [PATCH 088/127] add example Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index a65aa9cb8..aba41d406 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -57,7 +57,8 @@ # There are some mysterious Pydantic serialization warnings related to FunctionTool that are not fatal that clutter up logs. -# At some point we can try continue chasing this one down. +# At some point we can try continue chasing this one down. Example: +# (NemoGym pid=3160799) (swe_agents_val) PydanticSerializationUnexpectedValue(Expected `general-fields` - serialized value may not be as expected [field_name='tools', input_value=FunctionTool(name='str_re... a single call each.\n'), input_type=FunctionTool]) warnings.filterwarnings("ignore", message="FunctionTool") From 99d1fd571551e2a08332478f3b6af077f3720797 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:34:52 -0800 Subject: [PATCH 089/127] try profiling openhands Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 29b5de831..345adbb14 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -262,7 +262,7 @@ async def _run_openhands( "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) "export POETRY_VIRTUALENVS_IN_PROJECT=true && " - f"export NG_PROFILING_DIR=/trajectories_mount/profiling && " + f"export NG_PROFILING_DIR=/trajectories_mount/profiling_openhands && " "export POETRY_VIRTUALENVS_CREATE=false && " "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && " # TODO (sugam): fix cryptography issue From cb4a0406880ec5a188557cdd84ca8fddd959200d Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:50:18 -0800 Subject: [PATCH 090/127] bump Signed-off-by: Brian Yu --- .../swe_agents/configs/swebench_openhands.yaml | 2 +- .../swe_agents/configs/swebench_openhands_training.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml index 5234a1b22..40744020a 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml @@ -9,7 +9,7 @@ swe_agents: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01 + agent_framework_commit: bxyu/profiling # Container configuration container_formatter: ??? diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml index e74bf228f..aa5003562 100644 --- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml +++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml @@ -8,7 +8,7 @@ swe_agents_train: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01 + agent_framework_commit: bxyu/profiling # Container configuration container_formatter: ??? container_folder_path: null @@ -39,7 +39,7 @@ swe_agents_val: agent_config: responses_api_agents/swe_agents/configs/oh_config.toml agent_max_turns: 100 agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git - agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01 + agent_framework_commit: bxyu/profiling # Container configuration container_formatter: ??? container_folder_path: null From 1a763a711e5f150dffffa22ed3e1432da66a664b Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Fri, 23 Jan 2026 16:56:30 -0800 Subject: [PATCH 091/127] enable logging Signed-off-by: Brian Yu --- .../swe_agents/run_openhands.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 345adbb14..2eb315dbc 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -250,14 +250,16 @@ async def _run_openhands( # Use pre-built OpenHands "cd /openhands_setup/OpenHands && " "export RUNTIME=local && " - # "export LOG_LEVEL=DEBUG && " - # "export LOG_TO_FILE=true && " - "export LOG_LEVEL=CRITICAL && " - "export DEBUG=False && " - "export DEBUG_LLM=False && " - "export LOG_TO_FILE=False && " - "export LOG_ALL_EVENTS=False && " - "export DEBUG_RUNTIME=False && " + # Enable these two for debug logging + "export LOG_LEVEL=DEBUG && " + "export LOG_TO_FILE=true && " + # Disable these 5 for logging + # "export LOG_LEVEL=CRITICAL && " + # "export DEBUG=False && " + # "export DEBUG_LLM=False && " + # "export LOG_TO_FILE=False && " + # "export LOG_ALL_EVENTS=False && " + # "export DEBUG_RUNTIME=False && " "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && " "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) From 16e8be129370dbf67b27d3247e82657a3ba7a0b1 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Fri, 23 Jan 2026 17:25:03 -0800 Subject: [PATCH 092/127] feat: move copy logic to host Signed-off-by: Sugam Devare --- .../swe_agents/run_openhands.py | 78 +++++++++++++++---- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 7fdaba4af..c61999018 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -17,6 +17,7 @@ import os import re import shlex +import shutil import time import uuid from dataclasses import dataclass, field @@ -224,14 +225,6 @@ async def _run_openhands( assert self.openhands_setup_dir is not None, "OpenHands setup directory is not set" agent_script_name = f"agent_script_{agent_run_id}.sh" - cleanup_commands = ( - f"cd /openhands_setup/OpenHands && " - f"mkdir -p /trajectories_mount/trajectories/{data_point['instance_id']}/llm_completions/{data_point['instance_id']}/ && " - f"cp {eval_dir_in_openhands}/*/*/*/output.jsonl /trajectories_mount/trajectories/{data_point['instance_id']}/ && " - f"latest=$(ls -t {eval_dir_in_openhands}/*/*/*/llm_completions/*/*.json 2>/dev/null | head -1); " - f'[ -n "$latest" ] && cp "$latest" /trajectories_mount/trajectories/{data_point["instance_id"]}/llm_completions/{data_point["instance_id"]}/ && ' - f"rm -rf {eval_dir_in_openhands} && rm -rf {config_file_path}" - ) agent_main_cmd = ( "if [ -d /workspace ]; then " @@ -318,22 +311,18 @@ async def _run_openhands( agent_timeout_seconds = self.cfg.swebench_agent_timeout openhands_cmd = ( f"timeout --signal=TERM --kill-after=30 {agent_timeout_seconds} " - f"bash /trajectories_mount/{agent_script_name}; " - f"echo 'Cleaning up...'; " - f"{cleanup_commands}" + f"bash /trajectories_mount/{agent_script_name}" ) search_path = os.path.join( - self.output_dir / "trajectories", - "**", - data_point["instance_id"], + self.openhands_setup_dir / "OpenHands" / eval_dir_in_openhands, "**", "output.jsonl", ) try: # Execute OpenHands command - out_file = await self._execute_container_command( + out_file_in_eval = await self._execute_container_command( data_point=data_point, command=openhands_cmd, expected_file_pattern=search_path, @@ -342,6 +331,12 @@ async def _run_openhands( timeout=self.cfg.swebench_agent_timeout + 60, dataset_mount_path=dataset_mount_path, ) + out_file = self._openhands_dir_copy_from_host( + data_point=data_point, + eval_dir_in_openhands=eval_dir_in_openhands, + config_file_path=config_file_path, + output_file_path=out_file_in_eval, + ) with open(out_file, "r") as f: out_dict = json.loads(f.read().strip()) @@ -364,10 +359,63 @@ async def _run_openhands( ) ) except Exception as e: + self._openhands_dir_copy_from_host( + data_point=data_point, + eval_dir_in_openhands=eval_dir_in_openhands, + config_file_path=config_file_path, + output_file_path=None, + ) print(f"Running OpenHands failed: {e}", flush=True) return None return pred_file + def _openhands_dir_copy_from_host( + self, + data_point: dict[str, Any], + eval_dir_in_openhands: str, + config_file_path: str, + output_file_path: Optional[str], + ) -> Optional[str]: + + eval_dir_on_host = Path(self.openhands_setup_dir) / "OpenHands" / eval_dir_in_openhands + trajectories_root = Path(self.output_dir) / "trajectories" / data_point["instance_id"] + llm_completions_dir = trajectories_root / "llm_completions" / data_point["instance_id"] + trajectories_root.mkdir(parents=True, exist_ok=True) + llm_completions_dir.mkdir(parents=True, exist_ok=True) + + dest_output: Optional[str] = None + if output_file_path: + source_output = Path(output_file_path) + if not source_output.is_absolute(): + source_output = eval_dir_on_host / source_output + if not source_output.exists(): + output_candidates = sorted(eval_dir_on_host.glob("*/*/*/output.jsonl"), key=os.path.getmtime) + if not output_candidates: + raise FileNotFoundError( + f"No output.jsonl found under {eval_dir_on_host} for {data_point['instance_id']}." + ) + source_output = output_candidates[-1] + + dest_output_path = trajectories_root / "output.jsonl" + shutil.copy2(source_output, dest_output_path) + dest_output = str(dest_output_path) + + completion_candidates = glob.glob(str(eval_dir_on_host / "*/*/*/llm_completions/*/*.json")) + if completion_candidates: + latest_completion = max(completion_candidates, key=os.path.getmtime) + shutil.copy2( + latest_completion, + llm_completions_dir / Path(latest_completion).name, + ) + + shutil.rmtree(eval_dir_on_host, ignore_errors=True) + try: + Path(config_file_path).unlink() + except OSError: + pass + + return dest_output + def _write_instance_dataset(self, data_point: dict[str, Any], agent_run_id: str) -> Path: """ To avoid making HF dataset API calls, we write the instance dictionary to a file and mount it in the container. From 9699852c538b8385655797a30b7f84ab1a5bd75c Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sat, 24 Jan 2026 10:43:44 -0800 Subject: [PATCH 093/127] revert to shared folder Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 2eb315dbc..bf0253b52 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -264,7 +264,7 @@ async def _run_openhands( "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) "export POETRY_VIRTUALENVS_IN_PROJECT=true && " - f"export NG_PROFILING_DIR=/trajectories_mount/profiling_openhands && " + f"export NG_PROFILING_DIR=/trajectories_mount/profiling && " "export POETRY_VIRTUALENVS_CREATE=false && " "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && " # TODO (sugam): fix cryptography issue From cc43a52f0566db9763affb475c4e3bb6c48d43d2 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sat, 24 Jan 2026 11:52:20 -0800 Subject: [PATCH 094/127] pipe debug through Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 22 +++++++++------ .../swe_agents/run_openhands.py | 28 +++++++++++-------- responses_api_agents/swe_agents/utils.py | 8 +++++- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index aba41d406..44a5777d0 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -86,21 +86,23 @@ def decrement(self): def runner_ray_remote( concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any] ) -> Any: - concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) - print(f"Concurrent container #{concurrent_containers}", file=sys.stderr) - ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time - instance_id = params["problem_info"].get("instance_id", "unknown") - profiler = Profiler(name=instance_id, base_profile_dir=params["persistent_dir"] / "profiling") - profiler.start() + if params["debug"]: + concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) + print(f"Concurrent container #{concurrent_containers}", file=sys.stderr) + + instance_id = params["problem_info"].get("instance_id", "unknown") + profiler = Profiler(name=instance_id, base_profile_dir=params["persistent_dir"] / "profiling") + profiler.start() result = asyncio.run(runner(**params)) - profiler.stop() + if params["debug"]: + profiler.stop() - ray.get(concurrent_container_counter.decrement.remote()) + ray.get(concurrent_container_counter.decrement.remote()) return result @@ -166,6 +168,8 @@ class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig): description="Session ID for the run", ) + debug: bool = False + class SWEBenchRunRequest(BaseRunRequest): """Request format for SWE-bench runs.""" @@ -210,6 +214,7 @@ def model_post_init(self, __context: Any) -> None: self.config.openhands_setup_dir = setup_openhands_environment( agent_framework_repo=self.config.agent_framework_repo, agent_framework_commit=self.config.agent_framework_commit, + debug=self.config.debug, ) self.config.swebench_setup_dir = setup_swebench_environment() self.config.r2e_gym_setup_dir = setup_r2e_gym_environment() @@ -256,6 +261,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir, "dataset_path": self.config.dataset_path, "ray_queue_time": ray_queue_time, + "debug": self.config.debug, } # Run SWE-bench evaluation diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index bf0253b52..3ce797b42 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -97,6 +97,7 @@ class RunOpenHandsAgent: swebench_setup_dir: Path | None = None r2e_gym_setup_dir: Path | None = None dataset_path: str | None = None + debug: bool = False async def _run_swe_agent(self, data_point, api_base): """ @@ -229,6 +230,20 @@ async def _run_openhands( f"rm -rf {eval_dir_in_openhands} && rm -rf {config_file_path}" ) + if self.debug: + log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && " + profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && " + else: + log_cmd = ( + "export LOG_LEVEL=CRITICAL && " + "export DEBUG=False && " + "export DEBUG_LLM=False && " + "export LOG_TO_FILE=False && " + "export LOG_ALL_EVENTS=False && " + "export DEBUG_RUNTIME=False && " + ) + profiling_cmd = "" + agent_main_cmd = ( "if [ -d /workspace ]; then " " echo 'Exiting because /workspace is mounted.' && " @@ -250,21 +265,12 @@ async def _run_openhands( # Use pre-built OpenHands "cd /openhands_setup/OpenHands && " "export RUNTIME=local && " - # Enable these two for debug logging - "export LOG_LEVEL=DEBUG && " - "export LOG_TO_FILE=true && " - # Disable these 5 for logging - # "export LOG_LEVEL=CRITICAL && " - # "export DEBUG=False && " - # "export DEBUG_LLM=False && " - # "export LOG_TO_FILE=False && " - # "export LOG_ALL_EVENTS=False && " - # "export DEBUG_RUNTIME=False && " + f"{log_cmd}" + f"{profiling_cmd}" "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && " "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) "export POETRY_VIRTUALENVS_IN_PROJECT=true && " - f"export NG_PROFILING_DIR=/trajectories_mount/profiling && " "export POETRY_VIRTUALENVS_CREATE=false && " "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && " # TODO (sugam): fix cryptography issue diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 5ef2d72d5..251edf044 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -642,6 +642,7 @@ async def run_swebench_evaluation( dataset_path: Optional[str] = None, ray_queue_time: Optional[float] = None, ray_submit_time: Optional[float] = None, + debug: bool = False, ) -> Dict: instance_id = problem_info.get("instance_id", "unknown") output_file = persistent_dir / "output.jsonl" @@ -682,6 +683,7 @@ async def run_swebench_evaluation( swebench_setup_dir=swebench_setup_dir, r2e_gym_setup_dir=r2e_gym_setup_dir, dataset_path=dataset_path, + debug=debug, ) result = await run_oh.process_single_datapoint(problem_info, persistent_dir) @@ -750,6 +752,7 @@ def _run_setup_shell_script( timeout_seconds: int, label: str, timeout_error_message: Optional[str] = None, + debug: bool = False, ) -> None: script_path = setup_dir / script_name @@ -774,8 +777,9 @@ def _run_setup_shell_script( if process.stdout is None: raise RuntimeError("Failed to capture script output") + target_file = sys.stderr if debug else sys.stdout for line in process.stdout: - print(line, end="", file=sys.stderr) + print(line, end="", file=target_file) output_lines.append(line) process.wait(timeout=timeout_seconds) @@ -1039,6 +1043,7 @@ def setup_openhands_environment( agent_framework_repo: Optional[str] = "https://github.com/sdevare-nv/nv-OpenHands.git", agent_framework_commit: str = "gym", setup_dir: Optional[Path] = None, + debug: bool = False, ) -> Path: setup_dir = _resolve_setup_directory(setup_dir, "swe_openhands_setup") @@ -1207,6 +1212,7 @@ def setup_openhands_environment( timeout_seconds=1800, label="OpenHands", timeout_error_message="OpenHands setup timed out after 30 minutes", + debug=debug, ) print(f"Setup directory: {setup_dir}", flush=True) From 964a8592d5d439899ef0a1a13db824ec59d2e0ba Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sat, 24 Jan 2026 11:53:00 -0800 Subject: [PATCH 095/127] add apt instapll graphviz Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 3ce797b42..5720dc2c7 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -232,7 +232,9 @@ async def _run_openhands( if self.debug: log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && " - profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && " + profiling_cmd = ( + "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt update && apt install -y graphviz &&" + ) else: log_cmd = ( "export LOG_LEVEL=CRITICAL && " From 8c5f56639811f92e69a76b331f314747471b72a2 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sat, 24 Jan 2026 12:01:39 -0800 Subject: [PATCH 096/127] try apt get Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 5720dc2c7..43a84710f 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -232,9 +232,7 @@ async def _run_openhands( if self.debug: log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && " - profiling_cmd = ( - "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt update && apt install -y graphviz &&" - ) + profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt-get update && apt-get install -y graphviz &&" else: log_cmd = ( "export LOG_LEVEL=CRITICAL && " From 82414d89bdbb176d80d0ff80b14f64a851c74e07 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sat, 24 Jan 2026 12:07:30 -0800 Subject: [PATCH 097/127] remove apt install Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 43a84710f..3ce797b42 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -232,7 +232,7 @@ async def _run_openhands( if self.debug: log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && " - profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt-get update && apt-get install -y graphviz &&" + profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && " else: log_cmd = ( "export LOG_LEVEL=CRITICAL && " From 59f8a4d89072c0152d20affceb17a16e285dfc58 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sat, 24 Jan 2026 12:11:22 -0800 Subject: [PATCH 098/127] dump out afterwards Signed-off-by: Brian Yu --- .../swe_agents/run_openhands.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 3ce797b42..231cda310 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -25,6 +25,8 @@ from typing import Any, Optional import tomlkit +from gprof2dot import main as gprof2dot_main +from pydot import graph_from_dot_file class SupportedAgentFrameworks(str, Enum): @@ -366,6 +368,21 @@ async def _run_openhands( } ) ) + + # Dump out dot and png files from profiling on OpenHands level + if self.debug: + base_profile_dir = Path(self.output_dir) / "profiling" + profiling_name = "openhands" + callgrind_path = base_profile_dir / f"{profiling_name}.callgrind" + callgrind_dotfile_path = base_profile_dir / f"{profiling_name}.dot" + callgrind_graph_path = base_profile_dir / f"{profiling_name}.png" + + gprof2dot_main( + argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split() + ) + + (graph,) = graph_from_dot_file(callgrind_dotfile_path) + graph.write_png(callgrind_graph_path) except Exception as e: print(f"oh run_infer.sh output parsing failed: {e}", flush=True) return None From a3cf012046c5bf8ace4b950f9e3cd0a55c239f84 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sun, 25 Jan 2026 21:14:31 -0800 Subject: [PATCH 099/127] bump up pct Signed-off-by: Brian Yu --- nemo_gym/profiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py index dcdc61e51..c9eb37572 100644 --- a/nemo_gym/profiling.py +++ b/nemo_gym/profiling.py @@ -33,7 +33,7 @@ def dump(self) -> None: callgrind_graph_path = self.base_profile_dir / f"{self.name}.png" yappi.get_func_stats().save(callgrind_path, type="CALLGRIND") - gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()) + gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 5 -n 5 {callgrind_path}".split()) (graph,) = graph_from_dot_file(callgrind_dotfile_path) graph.write_png(callgrind_graph_path) From 54d95c18d91b8a9506c57f52b99bce433f13b63a Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Sun, 25 Jan 2026 21:16:23 -0800 Subject: [PATCH 100/127] increase to 5 Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/run_openhands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 231cda310..90ed05ee4 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -378,7 +378,7 @@ async def _run_openhands( callgrind_graph_path = base_profile_dir / f"{profiling_name}.png" gprof2dot_main( - argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split() + argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 5 -n 5 {callgrind_path}".split() ) (graph,) = graph_from_dot_file(callgrind_dotfile_path) From a688dd6580a633a3a009a258600c40084df59d48 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:18:41 -0800 Subject: [PATCH 101/127] add hits Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 9e6a9de2c..01e341983 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -276,6 +276,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params) result = await future + print("HIT A", file=sys.stderr) + # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) @@ -291,6 +293,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() self.config.agent_framework, ) + print("HIT B", file=sys.stderr) + # If no trajectory or empty output, create a summary message if not output_items: output_items = [ @@ -328,6 +332,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() if "swe-bench-metrics" in result: metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) + print("HIT C", file=sys.stderr) return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", created_at=int(time.time()), @@ -388,6 +393,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Run the evaluation response = await self.responses(fixed_params) + print("HIT D", file=sys.stderr) + # Extract initial input messages from the response output and get filtered output # These are the system/user messages that were actually sent to the agent input_messages, filtered_output = extract_input_messages_from_trajectory(response.output) @@ -406,6 +413,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Remove metadata from response after extracting metrics response = response.model_copy(update={"metadata": None}) + print("HIT E", file=sys.stderr) + # Parse metrics from JSON string if present metrics = json.loads(metadata.get("swe-bench-metrics", "{}")) if "swe-bench-metrics" in metadata else {} @@ -418,6 +427,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 + print("HIT F", file=sys.stderr) + # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input, From aa0b2c76bcd0b90c67548de32fc1d7f7b7ef81b1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:22:55 -0800 Subject: [PATCH 102/127] clean Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 01e341983..9e6a9de2c 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -276,8 +276,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params) result = await future - print("HIT A", file=sys.stderr) - # Extract trajectory and convert to proper NeMoGym format output_items = [] trajectory = result.get("trajectory", []) @@ -293,8 +291,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() self.config.agent_framework, ) - print("HIT B", file=sys.stderr) - # If no trajectory or empty output, create a summary message if not output_items: output_items = [ @@ -332,7 +328,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() if "swe-bench-metrics" in result: metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) - print("HIT C", file=sys.stderr) return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", created_at=int(time.time()), @@ -393,8 +388,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Run the evaluation response = await self.responses(fixed_params) - print("HIT D", file=sys.stderr) - # Extract initial input messages from the response output and get filtered output # These are the system/user messages that were actually sent to the agent input_messages, filtered_output = extract_input_messages_from_trajectory(response.output) @@ -413,8 +406,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Remove metadata from response after extracting metrics response = response.model_copy(update={"metadata": None}) - print("HIT E", file=sys.stderr) - # Parse metrics from JSON string if present metrics = json.loads(metadata.get("swe-bench-metrics", "{}")) if "swe-bench-metrics" in metadata else {} @@ -427,8 +418,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 - print("HIT F", file=sys.stderr) - # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input, From cbf9c35bd7da8e873846348af29fec271593a1ba Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:23:50 -0800 Subject: [PATCH 103/127] print each Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 9e6a9de2c..ca9be4808 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -418,6 +418,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 + print(f"{params_with_input=}\n{response=}\n{metrics=}", file=sys.stderr) + # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input, From d61f0ee0129f30d9038e8fef7d30db12894243a0 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:43:20 -0800 Subject: [PATCH 104/127] try model dump Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index ca9be4808..0f0e2016d 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -423,7 +423,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input, - response=response, + response=response.model_dump(), reward=reward, resolved=1.0 if resolved else 0.0, patch_exists=1.0 if patch_exists else 0.0, From 16085af32781a329d79b125deed1e4ef13d6036f Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:46:13 -0800 Subject: [PATCH 105/127] modeul dump again Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 0f0e2016d..d0024f1b2 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -418,11 +418,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 - print(f"{params_with_input=}\n{response=}\n{metrics=}", file=sys.stderr) - # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( - responses_create_params=params_with_input, + responses_create_params=params_with_input.model_dump(), response=response.model_dump(), reward=reward, resolved=1.0 if resolved else 0.0, From 95d08c150b903e8239b34859d09e6f0e8724d3a8 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:49:48 -0800 Subject: [PATCH 106/127] breakpoint Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index d0024f1b2..786a67339 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -418,6 +418,11 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 + params_with_input.model_dump() + response.model_dump() + + print("HIT BREAKPOINT") + # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input.model_dump(), From 0a9e25fac8b8baa952ffd33acda3cbe05b730484 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:53:08 -0800 Subject: [PATCH 107/127] stderr Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 786a67339..f70a32e4f 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -421,7 +421,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: params_with_input.model_dump() response.model_dump() - print("HIT BREAKPOINT") + print("HIT BREAKPOINT", file=sys.stderr) # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( From 4caf1173d2c978c78bf59afc092f467d11b82f43 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 10:57:14 -0800 Subject: [PATCH 108/127] separate breakpoints Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index f70a32e4f..d4b7da7a1 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -419,9 +419,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 params_with_input.model_dump() + print("HIT BREAKPOINT A", file=sys.stderr) response.model_dump() - - print("HIT BREAKPOINT", file=sys.stderr) + print("HIT BREAKPOINT B", file=sys.stderr) # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( From 1fc1a0864ce559145e23dad63703f01bd6481ce1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 11:00:27 -0800 Subject: [PATCH 109/127] try moppdel dump Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index d4b7da7a1..3b8c23b70 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -398,7 +398,10 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Add the extracted input messages and tools to the params # Note: tools should already be in the correct format from the response params_with_input = fixed_params.model_copy( - update={"input": input_messages, "tools": response.tools if response.tools else []} + update={ + "input": input_messages, + "tools": [t.model_dump() for t in response.tools] if response.tools else [], + } ) # Extract metrics from response metadata @@ -425,8 +428,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( - responses_create_params=params_with_input.model_dump(), - response=response.model_dump(), + responses_create_params=params_with_input, + response=response, reward=reward, resolved=1.0 if resolved else 0.0, patch_exists=1.0 if patch_exists else 0.0, From 0bf54608d2d2b12eba436349502f973ded3ce359 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 11:05:43 -0800 Subject: [PATCH 110/127] print Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 3b8c23b70..55c562705 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -366,7 +366,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: """Run and verify SWE-bench solution.""" async with self.sem: - print(f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True) + if self.config.debug: + print( + f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True + ) body.responses_create_params.metadata["container_concurrency"] = self.config.concurrency - self.sem._value # Fix None values in responses_create_params to use defaults @@ -421,10 +424,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 - params_with_input.model_dump() + print(params_with_input.model_dump(), file=sys.stderr) + print(params_with_input.metadata, file=sys.stderr) print("HIT BREAKPOINT A", file=sys.stderr) - response.model_dump() - print("HIT BREAKPOINT B", file=sys.stderr) # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( From 466690b16a6fe20877eadf59cf7e377d01a2ed24 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 11:20:26 -0800 Subject: [PATCH 111/127] print type v Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 55c562705..afd5ec812 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -426,6 +426,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: print(params_with_input.model_dump(), file=sys.stderr) print(params_with_input.metadata, file=sys.stderr) + for k, v in params_with_input.metadata.items(): + print(f"{k}: {type(v)}") print("HIT BREAKPOINT A", file=sys.stderr) # Build verification response with top-level numeric fields for statistics From 43554f9f8562f6336e752abf9b93b267646366b2 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 11:24:36 -0800 Subject: [PATCH 112/127] stderr Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index afd5ec812..cee4080d0 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -427,7 +427,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: print(params_with_input.model_dump(), file=sys.stderr) print(params_with_input.metadata, file=sys.stderr) for k, v in params_with_input.metadata.items(): - print(f"{k}: {type(v)}") + print(f"{k}: {type(v)}", file=sys.stderr) print("HIT BREAKPOINT A", file=sys.stderr) # Build verification response with top-level numeric fields for statistics From 56384c68dc0eddce9f57a3c1c092c0f0fb221d55 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 11:27:15 -0800 Subject: [PATCH 113/127] resolve metadata Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index cee4080d0..72d1c0c42 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -370,7 +370,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: print( f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True ) - body.responses_create_params.metadata["container_concurrency"] = self.config.concurrency - self.sem._value + body.responses_create_params.metadata["container_concurrency"] = str( + self.config.concurrency - self.sem._value + ) # Fix None values in responses_create_params to use defaults # This is needed because the pydantic model has non-Optional fields with defaults @@ -424,12 +426,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 - print(params_with_input.model_dump(), file=sys.stderr) - print(params_with_input.metadata, file=sys.stderr) - for k, v in params_with_input.metadata.items(): - print(f"{k}: {type(v)}", file=sys.stderr) - print("HIT BREAKPOINT A", file=sys.stderr) - # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input, From 56fdd0024bfa22add736bb7b6bf79f8430261865 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 13:47:19 -0800 Subject: [PATCH 114/127] openhands hsould log Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 ++ responses_api_agents/swe_agents/run_openhands.py | 8 ++++++-- responses_api_agents/swe_agents/utils.py | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 72d1c0c42..4f4773690 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -174,6 +174,7 @@ class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig): description="Session ID for the run", ) + openhands_should_log: bool = False debug: bool = False @@ -267,6 +268,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir, "dataset_path": self.config.dataset_path, "ray_queue_time": ray_queue_time, + "openhands_should_log": self.config.openhands_should_log, "debug": self.config.debug, "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb, "command_exec_timeout": self.config.command_exec_timeout, diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 11e1611ae..2bdf6ef11 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -102,6 +102,7 @@ class RunOpenHandsAgent: swebench_setup_dir: Path | None = None r2e_gym_setup_dir: Path | None = None dataset_path: str | None = None + openhands_should_log: bool = False debug: bool = False async def _run_swe_agent(self, data_point, api_base): @@ -230,8 +231,12 @@ async def _run_openhands( agent_script_name = f"agent_script_{agent_run_id}.sh" if self.debug: - log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && " profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && " + else: + profiling_cmd = "" + + if self.openhands_should_log: + log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && export NG_OPENHANDS_SHOULD_LOG=true && " else: log_cmd = ( "export LOG_LEVEL=CRITICAL && " @@ -241,7 +246,6 @@ async def _run_openhands( "export LOG_ALL_EVENTS=False && " "export DEBUG_RUNTIME=False && " ) - profiling_cmd = "" agent_main_cmd = ( "if [ -d /workspace ]; then " diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 80f39c061..14dc7912f 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -642,6 +642,7 @@ async def run_swebench_evaluation( dataset_path: Optional[str] = None, ray_queue_time: Optional[float] = None, ray_submit_time: Optional[float] = None, + openhands_should_log: bool = False, debug: bool = False, apptainer_memory_limit_mb: Optional[int] = None, command_exec_timeout: Optional[int] = None, @@ -687,6 +688,7 @@ async def run_swebench_evaluation( swebench_setup_dir=swebench_setup_dir, r2e_gym_setup_dir=r2e_gym_setup_dir, dataset_path=dataset_path, + openhands_should_log=openhands_should_log, debug=debug, ) From dccd50e3da1fa2aefcca8b16cb98702844b89e2b Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Mon, 26 Jan 2026 13:58:07 -0800 Subject: [PATCH 115/127] pipe global config dict Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 6 ++++++ responses_api_agents/swe_agents/run_openhands.py | 2 ++ responses_api_agents/swe_agents/utils.py | 2 ++ 3 files changed, 10 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 4f4773690..d729a63c1 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -14,6 +14,7 @@ import asyncio import json import os +import shlex import sys import time import uuid @@ -36,6 +37,7 @@ SimpleResponsesAPIAgent, ) from nemo_gym.config_types import ModelServerRef +from nemo_gym.global_config import OmegaConf, get_global_config_dict from nemo_gym.openai_utils import ( NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming, @@ -210,6 +212,7 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent): config: SWEBenchWrapperConfig sem: Semaphore = None _container_counter: ConcurrentContainerCounter = None + _global_config_dict_str: str = None model_config = ConfigDict(arbitrary_types_allowed=True) def model_post_init(self, __context: Any) -> None: @@ -231,6 +234,8 @@ def model_post_init(self, __context: Any) -> None: self.config.run_session_id = f"{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}" print(f"Run session ID: {self.config.run_session_id}", flush=True) + self._global_config_dict_str = shlex.quote(OmegaConf.to_yaml(get_global_config_dict())) + async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()) -> NeMoGymResponse: # Extract problem information from request problem_info = extract_problem_info( @@ -270,6 +275,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "ray_queue_time": ray_queue_time, "openhands_should_log": self.config.openhands_should_log, "debug": self.config.debug, + "ng_global_config_dict_str": self._global_config_dict_str, "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb, "command_exec_timeout": self.config.command_exec_timeout, } diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 2bdf6ef11..6d8e4a203 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -97,6 +97,7 @@ class SweBenchGenerationConfig: @dataclass class RunOpenHandsAgent: cfg: SweBenchGenerationConfig + ng_global_config_dict_str: str output_dir: str = None openhands_setup_dir: Path | None = None swebench_setup_dir: Path | None = None @@ -270,6 +271,7 @@ async def _run_openhands( "export RUNTIME=local && " f"{log_cmd}" f"{profiling_cmd}" + f"export NEMO_GYM_CONFIG_DICT={self.ng_global_config_dict_str} && " "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && " "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 14dc7912f..791876a00 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -634,6 +634,7 @@ async def run_swebench_evaluation( swebench_tests_timeout: int, swebench_agent_timeout: int, persistent_dir: Path, + ng_global_config_dict_str: str, agent_framework_repo: Optional[str] = None, agent_framework_commit: str = "HEAD", openhands_setup_dir: Optional[Path] = None, @@ -688,6 +689,7 @@ async def run_swebench_evaluation( swebench_setup_dir=swebench_setup_dir, r2e_gym_setup_dir=r2e_gym_setup_dir, dataset_path=dataset_path, + ng_global_config_dict_str=ng_global_config_dict_str, openhands_should_log=openhands_should_log, debug=debug, ) From 26dcf232c4f57dcce84c04173ddde543869182fe Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 27 Jan 2026 09:54:27 -0800 Subject: [PATCH 116/127] use num cpus 1 Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index d729a63c1..78e010016 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -83,7 +83,7 @@ def decrement(self): runtime_env={ "py_executable": sys.executable, }, - num_cpus=0.5, + num_cpus=1, ) def runner_ray_remote( concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any] From 92e00fd01d48c1eef013716e9f31e8b9864604f8 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 27 Jan 2026 21:04:42 -0800 Subject: [PATCH 117/127] pipe model name Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 1 + responses_api_agents/swe_agents/run_openhands.py | 2 ++ responses_api_agents/swe_agents/utils.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 78e010016..3376ede08 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -275,6 +275,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "ray_queue_time": ray_queue_time, "openhands_should_log": self.config.openhands_should_log, "debug": self.config.debug, + "model_server_name": self.config.model_server.name, "ng_global_config_dict_str": self._global_config_dict_str, "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb, "command_exec_timeout": self.config.command_exec_timeout, diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 6d8e4a203..c5cdcdfdf 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -98,6 +98,7 @@ class SweBenchGenerationConfig: class RunOpenHandsAgent: cfg: SweBenchGenerationConfig ng_global_config_dict_str: str + model_server_name: str output_dir: str = None openhands_setup_dir: Path | None = None swebench_setup_dir: Path | None = None @@ -272,6 +273,7 @@ async def _run_openhands( f"{log_cmd}" f"{profiling_cmd}" f"export NEMO_GYM_CONFIG_DICT={self.ng_global_config_dict_str} && " + f"export NEMO_GYM_MODEL_SERVER_NAME={self.model_server_name} &&" "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && " "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && " # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs) diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 791876a00..61d38085f 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -635,6 +635,7 @@ async def run_swebench_evaluation( swebench_agent_timeout: int, persistent_dir: Path, ng_global_config_dict_str: str, + model_server_name: str, agent_framework_repo: Optional[str] = None, agent_framework_commit: str = "HEAD", openhands_setup_dir: Optional[Path] = None, @@ -692,6 +693,7 @@ async def run_swebench_evaluation( ng_global_config_dict_str=ng_global_config_dict_str, openhands_should_log=openhands_should_log, debug=debug, + model_server_name=model_server_name, ) result = await run_oh.process_single_datapoint(problem_info, persistent_dir) From 1d6ce57f0a5ac77006a178587f10836b99a47d68 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:03:34 -0800 Subject: [PATCH 118/127] start add profiling metrics Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 3376ede08..d5b3aff98 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -205,6 +205,27 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): patch_exists: Optional[float] = None # 1.0 if patch exists, 0.0 otherwise patch_successfully_applied: Optional[float] = None # 1.0 if patch applied, 0.0 otherwise + # Profiling time metrics to report + # ray_queue_time: float + # generation_apptainer_spinup_time: float + # create_runtime_time: float + # container_initialization_time: float + # connect_to_runtime_time: float + # runtime_initialization_fn_time: float + # total_command_exec_time: float + # total_model_call_time: float + # final_eval_apptainer_spinup_time: float + # final_eval_time: float + + # Exit condition metrics to report + # TODO add more exit conditions + # hit_sample_timeout: bool + # hit_trajectory_command_exec_timeout: bool + # hit_eval_timeout: bool + # hit_results_parsing_failure: bool + # hit_success: bool + # hit_unknown: bool + class SWEBenchWrapper(SimpleResponsesAPIAgent): """Wrapper for NeMo-Skills SWE-bench evaluation in NeMo-Gym.""" @@ -326,6 +347,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "agent_framework": self.config.agent_framework, "has_trajectory": str(trajectory is not None), "instance_id": result.get("instance_id", problem_info.get("instance_id", "unknown")), + "instance_dir": instance_dir, } # Add evaluation results to metadata (convert to strings) From b8212434e5cd31cbd4a95a2de8f819a434b4466d Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:05:25 -0800 Subject: [PATCH 119/127] add placeholder Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index d5b3aff98..5fa0f32c1 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -473,6 +473,22 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: "patch_successfully_applied": patch_applied, "resolved": resolved, }, + # ray_queue_time=, + # generation_apptainer_spinup_time=, + # create_runtime_time=, + # container_initialization_time=, + # connect_to_runtime_time=, + # runtime_initialization_fn_time=, + # total_command_exec_time=, + # total_model_call_time=, + # final_eval_apptainer_spinup_time=, + # final_eval_time=, + # hit_sample_timeout=, + # hit_trajectory_command_exec_timeout=, + # hit_eval_timeout=, + # hit_results_parsing_failure=, + # hit_success=, + # hit_unknown=, ) From 2f49809fba2ce9283c9f80a1f9919741e028df20 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:19:55 -0800 Subject: [PATCH 120/127] hit success Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 5fa0f32c1..050126834 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -348,6 +348,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "has_trajectory": str(trajectory is not None), "instance_id": result.get("instance_id", problem_info.get("instance_id", "unknown")), "instance_dir": instance_dir, + "hit_success_str": json.dumps(bool(output_items)), } # Add evaluation results to metadata (convert to strings) @@ -391,7 +392,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() parallel_tool_calls=False, tool_choice="none", tools=[], - metadata={"error": str(e)}, + metadata={ + "error": str(e), + "hit_success_str": json.dumps(False), + }, ) async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: @@ -487,7 +491,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # hit_trajectory_command_exec_timeout=, # hit_eval_timeout=, # hit_results_parsing_failure=, - # hit_success=, + hit_success=json.loads(metadata["hit_success_str"]), # hit_unknown=, ) From 7e018e4a97fc0144892da3552f02a5e53607ba96 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:20:05 -0800 Subject: [PATCH 121/127] hit success Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 050126834..382e8aa3c 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -223,7 +223,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): # hit_trajectory_command_exec_timeout: bool # hit_eval_timeout: bool # hit_results_parsing_failure: bool - # hit_success: bool + hit_success: bool # hit_unknown: bool From 2a73772e4050ed581bdb571cce938dce1d3b3da4 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:24:55 -0800 Subject: [PATCH 122/127] hit unknown Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 382e8aa3c..a573c21ab 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -224,7 +224,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): # hit_eval_timeout: bool # hit_results_parsing_failure: bool hit_success: bool - # hit_unknown: bool + hit_unknown: bool class SWEBenchWrapper(SimpleResponsesAPIAgent): @@ -461,6 +461,15 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 + hit_metrics = dict( + # hit_sample_timeout=, + # hit_trajectory_command_exec_timeout=, + # hit_eval_timeout=, + # hit_results_parsing_failure=, + hit_success=json.loads(metadata["hit_success_str"]), + ) + hit_metrics["hit_unknown"] = not any(hit_metrics.values()) + # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( responses_create_params=params_with_input, @@ -487,12 +496,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # total_model_call_time=, # final_eval_apptainer_spinup_time=, # final_eval_time=, - # hit_sample_timeout=, - # hit_trajectory_command_exec_timeout=, - # hit_eval_timeout=, - # hit_results_parsing_failure=, - hit_success=json.loads(metadata["hit_success_str"]), - # hit_unknown=, + **hit_metrics, ) From d648e56217c66775da197fcd82d5a581bf30a1cd Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:33:36 -0800 Subject: [PATCH 123/127] hit_empty_trajectory Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index a573c21ab..f04ada4ec 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -222,7 +222,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): # hit_sample_timeout: bool # hit_trajectory_command_exec_timeout: bool # hit_eval_timeout: bool - # hit_results_parsing_failure: bool + hit_empty_trajectory: bool hit_success: bool hit_unknown: bool @@ -349,6 +349,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_id": result.get("instance_id", problem_info.get("instance_id", "unknown")), "instance_dir": instance_dir, "hit_success_str": json.dumps(bool(output_items)), + "hit_empty_trajectory_str": json.dumps(not trajectory), } # Add evaluation results to metadata (convert to strings) @@ -395,6 +396,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() metadata={ "error": str(e), "hit_success_str": json.dumps(False), + "hit_empty_trajectory_str": json.dumps((not trajectory) if "trajectory" in dir() else False), }, ) @@ -466,6 +468,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: # hit_trajectory_command_exec_timeout=, # hit_eval_timeout=, # hit_results_parsing_failure=, + hit_empty_trajectory=json.loads(metadata["hit_empty_trajectory_str"]), hit_success=json.loads(metadata["hit_success_str"]), ) hit_metrics["hit_unknown"] = not any(hit_metrics.values()) From 0ab3b8606add5a31f13777404768a758431d88de Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:38:10 -0800 Subject: [PATCH 124/127] hit_responses_exception Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index f04ada4ec..64d8b137b 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -224,7 +224,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): # hit_eval_timeout: bool hit_empty_trajectory: bool hit_success: bool - hit_unknown: bool + hit_responses_exception: bool class SWEBenchWrapper(SimpleResponsesAPIAgent): @@ -350,6 +350,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "instance_dir": instance_dir, "hit_success_str": json.dumps(bool(output_items)), "hit_empty_trajectory_str": json.dumps(not trajectory), + "hit_responses_exception_str": json.dumps(False), } # Add evaluation results to metadata (convert to strings) @@ -397,6 +398,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "error": str(e), "hit_success_str": json.dumps(False), "hit_empty_trajectory_str": json.dumps((not trajectory) if "trajectory" in dir() else False), + "hit_responses_exception_str": json.dumps(True), }, ) @@ -463,15 +465,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: reward = 1.0 if resolved else 0.0 - hit_metrics = dict( - # hit_sample_timeout=, - # hit_trajectory_command_exec_timeout=, - # hit_eval_timeout=, - # hit_results_parsing_failure=, - hit_empty_trajectory=json.loads(metadata["hit_empty_trajectory_str"]), - hit_success=json.loads(metadata["hit_success_str"]), - ) - hit_metrics["hit_unknown"] = not any(hit_metrics.values()) + hit_metrics = {k.removesuffix("_str"): json.loads(v) for k, v in metadata.items() if k.startswith("hit_")} # Build verification response with top-level numeric fields for statistics return SWEBenchVerifyResponse( From 769c155825c9a0aba003489c10edc2ce85cc2045 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:42:08 -0800 Subject: [PATCH 125/127] plumb NEMO_GYM_METRICS_FPATH Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 ++ responses_api_agents/swe_agents/run_openhands.py | 2 ++ responses_api_agents/swe_agents/utils.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 64d8b137b..658578518 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -274,6 +274,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() workspace_root = Path(os.path.dirname(os.path.abspath(__file__))) persistent_dir = workspace_root / f"swebench_results_{self.config.run_session_id}" / instance_dir persistent_dir.mkdir(parents=True, exist_ok=True) + metrics_fpath = persistent_dir / "nemo_gym_metrics.json" try: ray_queue_time = time.time() params = { @@ -287,6 +288,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() "swebench_tests_timeout": self.config.swebench_tests_timeout, "swebench_agent_timeout": self.config.swebench_agent_timeout, "persistent_dir": persistent_dir, + "metrics_fpath": metrics_fpath, "agent_framework_repo": self.config.agent_framework_repo, "agent_framework_commit": self.config.agent_framework_commit, "openhands_setup_dir": self.config.openhands_setup_dir, diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index c5cdcdfdf..073097678 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -106,6 +106,7 @@ class RunOpenHandsAgent: dataset_path: str | None = None openhands_should_log: bool = False debug: bool = False + metrics_fpath: Path async def _run_swe_agent(self, data_point, api_base): """ @@ -272,6 +273,7 @@ async def _run_openhands( "export RUNTIME=local && " f"{log_cmd}" f"{profiling_cmd}" + f"export NEMO_GYM_METRICS_FPATH={self.metrics_fpath} && " f"export NEMO_GYM_CONFIG_DICT={self.ng_global_config_dict_str} && " f"export NEMO_GYM_MODEL_SERVER_NAME={self.model_server_name} &&" "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && " diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py index 61d38085f..5f08d3a7e 100644 --- a/responses_api_agents/swe_agents/utils.py +++ b/responses_api_agents/swe_agents/utils.py @@ -634,6 +634,7 @@ async def run_swebench_evaluation( swebench_tests_timeout: int, swebench_agent_timeout: int, persistent_dir: Path, + metrics_fpath: Path, ng_global_config_dict_str: str, model_server_name: str, agent_framework_repo: Optional[str] = None, @@ -694,6 +695,7 @@ async def run_swebench_evaluation( openhands_should_log=openhands_should_log, debug=debug, model_server_name=model_server_name, + metrics_fpath=metrics_fpath, ) result = await run_oh.process_single_datapoint(problem_info, persistent_dir) From 3a42aef4cddd4faddaf4079fda8537f3e00ad8e1 Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 10:50:15 -0800 Subject: [PATCH 126/127] report time metrics Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 658578518..711f5c1a3 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -91,6 +91,10 @@ def runner_ray_remote( ray_submit_time = time.time() params["ray_submit_time"] = ray_submit_time + # This is the first instance so we don't need to load anything + with params["metrics_fpath"].open("w") as f: + json.dump({"ray_queue_time": ray_submit_time - params["ray_queue_time"]}, f) + if params["debug"]: concurrent_containers = ray.get(concurrent_container_counter.increment.remote()) print(f"Concurrent container #{concurrent_containers}", file=sys.stderr) @@ -206,7 +210,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): patch_successfully_applied: Optional[float] = None # 1.0 if patch applied, 0.0 otherwise # Profiling time metrics to report - # ray_queue_time: float + ray_queue_time: float # generation_apptainer_spinup_time: float # create_runtime_time: float # container_initialization_time: float @@ -364,6 +368,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body() if "swe-bench-metrics" in result: metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"]) + metadata["timing_metrics"] = metrics_fpath.read_text() + return NeMoGymResponse( id=f"swebench-{problem_info.get('instance_id', 'unknown')}", created_at=int(time.time()), @@ -485,16 +491,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse: "patch_successfully_applied": patch_applied, "resolved": resolved, }, - # ray_queue_time=, - # generation_apptainer_spinup_time=, - # create_runtime_time=, - # container_initialization_time=, - # connect_to_runtime_time=, - # runtime_initialization_fn_time=, - # total_command_exec_time=, - # total_model_call_time=, - # final_eval_apptainer_spinup_time=, - # final_eval_time=, + **json.loads(metadata["timing_metrics"]), **hit_metrics, ) From 40874f8b8e61a9c033937384e75d78e84c89e09d Mon Sep 17 00:00:00 2001 From: Brian Yu Date: Tue, 3 Feb 2026 13:01:44 -0800 Subject: [PATCH 127/127] final eval time Signed-off-by: Brian Yu --- responses_api_agents/swe_agents/app.py | 2 +- responses_api_agents/swe_agents/run_openhands.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py index 711f5c1a3..bc27ad071 100644 --- a/responses_api_agents/swe_agents/app.py +++ b/responses_api_agents/swe_agents/app.py @@ -219,7 +219,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse): # total_command_exec_time: float # total_model_call_time: float # final_eval_apptainer_spinup_time: float - # final_eval_time: float + final_eval_time: float # Exit condition metrics to report # TODO add more exit conditions diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py index 073097678..8007cd492 100644 --- a/responses_api_agents/swe_agents/run_openhands.py +++ b/responses_api_agents/swe_agents/run_openhands.py @@ -1116,6 +1116,10 @@ async def process_single_datapoint(self, data_point: dict[str, Any], persistent_ "evaluation_time": evaluation_time, } + nemo_gym_metrics = json.loads(self.metrics_fpath.read_text()) + with self.metrics_fpath.open("w") as f: + json.dump(nemo_gym_metrics | {"final_eval_time": evaluation_time}, f) + return output_dict finally: self._cleanup_instance_dataset(instance_dataset_path)