From 9a478effba7d3b837765ace2de51fd277c066b41 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 20 Jan 2026 21:37:03 -0800
Subject: [PATCH 001/127] print client response error on debug

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index a7ca1f2f8..58ae1337f 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -433,6 +433,9 @@ async def exception_handling_middleware(request: Request, call_next):
                 )
 
                 response_content = f"Hit an exception in {self.get_session_middleware_key()} calling an inner server: {e.response_content}"
+                if _GLOBAL_AIOHTTP_CLIENT_REQUEST_DEBUG:
+                    print(response_content)
+
                 return JSONResponse(content=response_content, status_code=500)
             except Exception as e:
                 print(

From 5c0f1588817545326c592a5c7d7783a0a8a84536 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:08:46 -0800
Subject: [PATCH 002/127] print result

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index a1e1557aa..610ed4faa 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -212,6 +212,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             result = await future
 
+            # TODO remove
+            print(result)
+
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])

From ec0f4d4bf31f47bff5876e9d4baf0a2518a1f98c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:15:53 -0800
Subject: [PATCH 003/127] print params

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 610ed4faa..5e7cd5d12 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -209,6 +209,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_dir": instance_dir,
             }
 
+            # TODO remove
+            print(params)
+
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             result = await future
 

From f3b8e3bc777421ebaa76437dbcb8814847dc9ca8 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:18:20 -0800
Subject: [PATCH 004/127] print before run_oh.process_single_datapoint

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 376b4f153..939808e16 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -684,6 +684,10 @@ async def run_swebench_evaluation(
         r2e_gym_setup_dir=r2e_gym_setup_dir,
         dataset_path=dataset_path,
     )
+
+    # TODO remove
+    print("Hit before run_oh")
+
     result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 

From 8b9b2877807ba2320db27b174e256ca980ddf4a6 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:23:00 -0800
Subject: [PATCH 005/127] print on entry to run_swebench_evaluation

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 939808e16..32527527c 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -640,6 +640,9 @@ async def run_swebench_evaluation(
     dataset_path: Optional[str] = None,
     instance_dir: Optional[str] = None,
 ) -> Dict:
+    # TODO remove
+    print("Hit inside")
+
     # Create persistent directory for I/O and logs in local workspace
     workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
     instance_id = problem_info.get("instance_id", "unknown")

From 86b6fd91180739a402e013dd095355ef47fcb899 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:24:37 -0800
Subject: [PATCH 006/127] clean up debug prints in swe_agents app and utils

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py   | 6 ------
 responses_api_agents/swe_agents/utils.py | 6 ------
 2 files changed, 12 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 5e7cd5d12..a1e1557aa 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -209,15 +209,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_dir": instance_dir,
             }
 
-            # TODO remove
-            print(params)
-
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             result = await future
 
-            # TODO remove
-            print(result)
-
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 32527527c..b8cd5604f 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -640,9 +640,6 @@ async def run_swebench_evaluation(
     dataset_path: Optional[str] = None,
     instance_dir: Optional[str] = None,
 ) -> Dict:
-    # TODO remove
-    print("Hit inside")
-
     # Create persistent directory for I/O and logs in local workspace
     workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
     instance_id = problem_info.get("instance_id", "unknown")
@@ -688,9 +685,6 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
-    # TODO remove
-    print("Hit before run_oh")
-
     result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 

From f46109f20918e66e686697af494f53c6907e4f5a Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:25:41 -0800
Subject: [PATCH 007/127] try traceback

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index a1e1557aa..0cc7a6729 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -298,6 +298,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 tools=[],
                 metadata={"error": str(e)},
             )
+        except:
+            import traceback
+
+            traceback.print_exc()
 
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""

From 8971ad50ce1e5af5d963b8d14556c14da5d53816 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:27:19 -0800
Subject: [PATCH 008/127] print in runner_ray_remote, drop bare except traceback

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 0cc7a6729..ee723a42e 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -60,6 +60,8 @@
     },
 )
 def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any:
+    # TODO remove
+    print("Hit in runner_ray_remote")
     return asyncio.run(runner(**params))
 
 
@@ -298,10 +300,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 tools=[],
                 metadata={"error": str(e)},
             )
-        except:
-            import traceback
-
-            traceback.print_exc()
 
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""

From 11eef9042ebc3ba3175337b763495980350dcf46 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:29:11 -0800
Subject: [PATCH 009/127] print future returned by runner_ray_remote.remote

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index ee723a42e..da3d5921a 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -60,8 +60,6 @@
     },
 )
 def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any:
-    # TODO remove
-    print("Hit in runner_ray_remote")
     return asyncio.run(runner(**params))
 
 
@@ -212,6 +210,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             }
 
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
+            # TODO remove
+            print("FUTURE", future)
+
             result = await future
 
             # Extract trajectory and convert to proper NeMoGym format

From 6d384fc669d58e114260f0341fdb183ed07fa8d0 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:33:36 -0800
Subject: [PATCH 010/127] print before runner_ray_remote.remote call

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index da3d5921a..23fd354d8 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -209,6 +209,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_dir": instance_dir,
             }
 
+            print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params))
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             # TODO remove
             print("FUTURE", future)

From 91271d5f0c52f106a6ae7c6c5c4880d1dcad3acb Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:35:24 -0800
Subject: [PATCH 011/127] flush debug prints and print inside run

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 23fd354d8..c6e0009fc 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -209,10 +209,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_dir": instance_dir,
             }
 
-            print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params))
+            print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params), flush=True)
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             # TODO remove
-            print("FUTURE", future)
+            print("FUTURE", future, flush=True)
 
             result = await future
 
@@ -306,6 +306,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""
         async with self.sem:
+            # TODO remove
+            print("hit inside run", flush=True)
+
             # Fix None values in responses_create_params to use defaults
             # This is needed because the pydantic model has non-Optional fields with defaults
 

From 35cb2281e5f8fd1a082be339e1e7e293877a6501 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:36:57 -0800
Subject: [PATCH 012/127] clean up debug prints in swe_agents app

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index c6e0009fc..a1e1557aa 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -209,11 +209,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_dir": instance_dir,
             }
 
-            print("BEFORE REMOTE", runner_ray_remote, run_swebench_evaluation, len(params), flush=True)
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
-            # TODO remove
-            print("FUTURE", future, flush=True)
-
             result = await future
 
             # Extract trajectory and convert to proper NeMoGym format
@@ -306,9 +302,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""
         async with self.sem:
-            # TODO remove
-            print("hit inside run", flush=True)
-
             # Fix None values in responses_create_params to use defaults
             # This is needed because the pydantic model has non-Optional fields with defaults
 

From cb0ff0b842bbde6fbe551d85221d4421afa6163c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:37:43 -0800
Subject: [PATCH 013/127] print result and exception tracebacks in responses

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index a1e1557aa..8bb414331 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -212,6 +212,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             result = await future
 
+            # TODO remove
+            print("RESULT", result, flush=True)
+
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])
@@ -277,6 +280,11 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             )
 
         except Exception as e:
+            # TODO remove
+            import traceback
+
+            print(traceback.format_exc(), flush=True)
+
             print(f"SWE-bench evaluation failed: {str(e)}", flush=True)
             # Return error response
             error_message = NeMoGymResponseOutputMessage(
@@ -298,6 +306,11 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 tools=[],
                 metadata={"error": str(e)},
             )
+        except:
+            # TODO remove
+            import traceback
+
+            print(traceback.format_exc(), flush=True)
 
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""

From badddbaac26811ce9e4c1dfbb69ade88039de43a Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:40:04 -0800
Subject: [PATCH 014/127] print problem_info in run_swebench_evaluation

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index b8cd5604f..bd2146838 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -640,6 +640,9 @@ async def run_swebench_evaluation(
     dataset_path: Optional[str] = None,
     instance_dir: Optional[str] = None,
 ) -> Dict:
+    # TODO remove
+    print("Hit inside run_swebench_evaluation", problem_info, flush=True)
+
     # Create persistent directory for I/O and logs in local workspace
     workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
     instance_id = problem_info.get("instance_id", "unknown")
@@ -685,6 +688,8 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
+    # TODO remove
+    print("Hit before process_single_datapoint", problem_info, flush=True)
     result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 

From 4e239f74e760b05a9070f780d911490e46668c63 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:43:18 -0800
Subject: [PATCH 015/127] print exc

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index bd2146838..d691551ce 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -640,9 +640,6 @@ async def run_swebench_evaluation(
     dataset_path: Optional[str] = None,
     instance_dir: Optional[str] = None,
 ) -> Dict:
-    # TODO remove
-    print("Hit inside run_swebench_evaluation", problem_info, flush=True)
-
     # Create persistent directory for I/O and logs in local workspace
     workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
     instance_id = problem_info.get("instance_id", "unknown")
@@ -688,9 +685,14 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
-    # TODO remove
-    print("Hit before process_single_datapoint", problem_info, flush=True)
-    result = await run_oh.process_single_datapoint(problem_info)
+    try:
+        result = await run_oh.process_single_datapoint(problem_info)
+    except:
+        # TODO remove
+        import traceback
+
+        print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True)
+
     print(f"Process completed for {instance_id}", flush=True)
 
     try:

From 890376e4322d726a16a9cb585d41b7f4eaf8171f Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:44:50 -0800
Subject: [PATCH 016/127] move traceback print into process_single_datapoint

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 5 +++++
 responses_api_agents/swe_agents/utils.py         | 9 +--------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 16e30ec4f..ea01ab73c 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -1025,5 +1025,10 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
             }
 
             return output_dict
+        except:
+            # TODO remove
+            import traceback
+
+            print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True)
         finally:
             self._cleanup_instance_dataset(instance_dataset_path)
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index d691551ce..b8cd5604f 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -685,14 +685,7 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
-    try:
-        result = await run_oh.process_single_datapoint(problem_info)
-    except:
-        # TODO remove
-        import traceback
-
-        print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True)
-
+    result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 
     try:

From 702e2998f9e5c5223aa712e1aa1da2b954a158c5 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:49:32 -0800
Subject: [PATCH 017/127] print HIT markers through process_single_datapoint

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index ea01ab73c..bcf677bec 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -909,6 +909,7 @@ def check_tests_passed(
         return required_tests <= passed_tests
 
     async def process_single_datapoint(self, data_point: dict[str, Any]):
+        print("HIT 1", flush=True)
         self.output_dir = Path(self.cfg.output_file).parent
 
         agent_run_id = f"{data_point['instance_id']}_{int(time.time())}_{str(uuid.uuid4())[:8]}"
@@ -921,12 +922,14 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
         trajectory_dict = None
         try:
             if self.cfg.agent_framework == SupportedAgentFrameworks.swe_agent:
+                print("HIT 2", flush=True)
                 pred_file = await self._run_swe_agent(
                     data_point,
                     api_base,
                     instance_dataset_path,
                 )
             elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands:
+                print("HIT 3", flush=True)
                 pred_file = await self._run_openhands(
                     data_point,
                     api_base,
@@ -934,14 +937,17 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     instance_dataset_path,
                 )
             else:
+                print("HIT 4", flush=True)
                 raise ValueError(
                     f"Unsupported agent framework: {self.cfg.agent_framework}. "
                     f"Supported frameworks: {', '.join(SupportedAgentFrameworks)}."
                 )
 
+            print("HIT 5", flush=True)
             generation_time = asyncio.get_running_loop().time() - start_time
 
             if pred_file is None:
+                print("HIT 6", flush=True)
                 report_json = {
                     data_point["instance_id"]: {
                         "resolved": False,
@@ -952,6 +958,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     }
                 }
             else:
+                print("HIT 7", flush=True)
                 pred_mounted_path = pred_file.replace(str(self.output_dir), "/trajectories_mount")
                 with open(pred_file, "r") as f:
                     trajectory_dict = json.loads(f.read())
@@ -960,6 +967,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                 has_patch = trajectory_dict["model_patch"] is not None
 
                 if not has_patch:
+                    print("HIT 8", flush=True)
                     report_json = {
                         data_point["instance_id"]: {
                             "resolved": False,
@@ -971,17 +979,20 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     }
 
                 else:
+                    print("HIT 9", flush=True)
                     # Run full evaluation with streaming output
                     # TODO: should we fail on errors here? Seems that json isn't always generated
                     try:
                         start_time = asyncio.get_running_loop().time()
                         if data_point["dataset_name"] == "nv-internal-1":
+                            print("HIT 10", flush=True)
                             report_file = await self._run_nv_internal_eval(
                                 data_point,
                                 trajectory_dict["model_patch"],
                                 instance_dataset_path,
                             )
                         elif "R2E-Gym" in data_point["dataset_name"]:
+                            print("HIT 11", flush=True)
                             report_file = await self._run_r2e_gym_eval(
                                 pred_mounted_path,
                                 data_point,
@@ -989,6 +1000,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                                 instance_dataset_path,
                             )
                         else:
+                            print("HIT 12", flush=True)
                             report_file = await self._run_swebench_eval(
                                 pred_mounted_path,
                                 data_point,
@@ -997,6 +1009,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                             )
                         evaluation_time = asyncio.get_running_loop().time() - start_time
                     except ValueError:
+                        print("HIT 13", flush=True)
                         print(
                             f"Failed to execute SWE-bench evaluation command for {data_point['instance_id']}",
                             flush=True,
@@ -1012,10 +1025,13 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                         }
                         report_file = None
 
+                    print("HIT 14", flush=True)
                     if report_file is not None:
+                        print("HIT 15", flush=True)
                         with open(report_file, "r") as f:
                             report_json = json.loads(f.read().strip())
 
+            print("HIT 16", flush=True)
             output_dict = {
                 "swe-bench-metrics": report_json[data_point["instance_id"]],
                 "swe-bench-outputs": trajectory_dict,

From 3b911a77745f1d81d2a4ff22e9a58aa9978e1041 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 16:59:48 -0800
Subject: [PATCH 018/127] print hit

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index b8cd5604f..88ba0d368 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -685,6 +685,7 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
+    print("HIT before process_single_datapoint", problem_info, flush=True)
     result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 

From 6ba18ad5fde9403760fc964dc33eecd6ebf1dd13 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 17:04:21 -0800
Subject: [PATCH 019/127] clean up debug prints in run_openhands

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/run_openhands.py               | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index bcf677bec..16e30ec4f 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -909,7 +909,6 @@ def check_tests_passed(
         return required_tests <= passed_tests
 
     async def process_single_datapoint(self, data_point: dict[str, Any]):
-        print("HIT 1", flush=True)
         self.output_dir = Path(self.cfg.output_file).parent
 
         agent_run_id = f"{data_point['instance_id']}_{int(time.time())}_{str(uuid.uuid4())[:8]}"
@@ -922,14 +921,12 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
         trajectory_dict = None
         try:
             if self.cfg.agent_framework == SupportedAgentFrameworks.swe_agent:
-                print("HIT 2", flush=True)
                 pred_file = await self._run_swe_agent(
                     data_point,
                     api_base,
                     instance_dataset_path,
                 )
             elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands:
-                print("HIT 3", flush=True)
                 pred_file = await self._run_openhands(
                     data_point,
                     api_base,
@@ -937,17 +934,14 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     instance_dataset_path,
                 )
             else:
-                print("HIT 4", flush=True)
                 raise ValueError(
                     f"Unsupported agent framework: {self.cfg.agent_framework}. "
                     f"Supported frameworks: {', '.join(SupportedAgentFrameworks)}."
                 )
 
-            print("HIT 5", flush=True)
             generation_time = asyncio.get_running_loop().time() - start_time
 
             if pred_file is None:
-                print("HIT 6", flush=True)
                 report_json = {
                     data_point["instance_id"]: {
                         "resolved": False,
@@ -958,7 +952,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     }
                 }
             else:
-                print("HIT 7", flush=True)
                 pred_mounted_path = pred_file.replace(str(self.output_dir), "/trajectories_mount")
                 with open(pred_file, "r") as f:
                     trajectory_dict = json.loads(f.read())
@@ -967,7 +960,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                 has_patch = trajectory_dict["model_patch"] is not None
 
                 if not has_patch:
-                    print("HIT 8", flush=True)
                     report_json = {
                         data_point["instance_id"]: {
                             "resolved": False,
@@ -979,20 +971,17 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     }
 
                 else:
-                    print("HIT 9", flush=True)
                     # Run full evaluation with streaming output
                     # TODO: should we fail on errors here? Seems that json isn't always generated
                     try:
                         start_time = asyncio.get_running_loop().time()
                         if data_point["dataset_name"] == "nv-internal-1":
-                            print("HIT 10", flush=True)
                             report_file = await self._run_nv_internal_eval(
                                 data_point,
                                 trajectory_dict["model_patch"],
                                 instance_dataset_path,
                             )
                         elif "R2E-Gym" in data_point["dataset_name"]:
-                            print("HIT 11", flush=True)
                             report_file = await self._run_r2e_gym_eval(
                                 pred_mounted_path,
                                 data_point,
@@ -1000,7 +989,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                                 instance_dataset_path,
                             )
                         else:
-                            print("HIT 12", flush=True)
                             report_file = await self._run_swebench_eval(
                                 pred_mounted_path,
                                 data_point,
@@ -1009,7 +997,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                             )
                         evaluation_time = asyncio.get_running_loop().time() - start_time
                     except ValueError:
-                        print("HIT 13", flush=True)
                         print(
                             f"Failed to execute SWE-bench evaluation command for {data_point['instance_id']}",
                             flush=True,
@@ -1025,13 +1012,10 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                         }
                         report_file = None
 
-                    print("HIT 14", flush=True)
                     if report_file is not None:
-                        print("HIT 15", flush=True)
                         with open(report_file, "r") as f:
                             report_json = json.loads(f.read().strip())
 
-            print("HIT 16", flush=True)
             output_dict = {
                 "swe-bench-metrics": report_json[data_point["instance_id"]],
                 "swe-bench-outputs": trajectory_dict,
@@ -1041,10 +1025,5 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
             }
 
             return output_dict
-        except:
-            # TODO remove
-            import traceback
-
-            print("Hit exception in process_single_datapoint", traceback.format_exc(), flush=True)
         finally:
             self._cleanup_instance_dataset(instance_dataset_path)

From 3726bba79ff4ab5a7cbc926e7c05eb44f2eb82cc Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 17:14:34 -0800
Subject: [PATCH 020/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py   | 10 ----------
 responses_api_agents/swe_agents/utils.py |  1 -
 2 files changed, 11 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 8bb414331..99ceac029 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -280,11 +280,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             )
 
         except Exception as e:
-            # TODO remove
-            import traceback
-
-            print(traceback.format_exc(), flush=True)
-
             print(f"SWE-bench evaluation failed: {str(e)}", flush=True)
             # Return error response
             error_message = NeMoGymResponseOutputMessage(
@@ -306,11 +301,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 tools=[],
                 metadata={"error": str(e)},
             )
-        except:
-            # TODO remove
-            import traceback
-
-            print(traceback.format_exc(), flush=True)
 
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 88ba0d368..b8cd5604f 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -685,7 +685,6 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
-    print("HIT before process_single_datapoint", problem_info, flush=True)
     result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 

From 7ae04fa9fa17663e70177c553e7f3c399ba18749 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 17:21:16 -0800
Subject: [PATCH 021/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 99ceac029..a1e1557aa 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -212,9 +212,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
             result = await future
 
-            # TODO remove
-            print("RESULT", result, flush=True)
-
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])

From 78cce58d2f7d2302542b0fe8824d989dc73ff6b0 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Wed, 21 Jan 2026 18:03:26 -0800
Subject: [PATCH 022/127] feat: oh metrics block commands

Signed-off-by: Sugam Devare <sdevare@nvidia.com>
---
 responses_api_agents/swe_agents/app.py                      | 5 +++++
 .../swe_agents/configs/swebench_openhands.yaml              | 2 +-
 .../swe_agents/configs/swebench_openhands_training.yaml     | 4 ++--
 responses_api_agents/swe_agents/run_openhands.py            | 3 ++-
 responses_api_agents/swe_agents/utils.py                    | 6 ++++--
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index a1e1557aa..1c0c9b046 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -60,6 +60,8 @@
     },
 )
 def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any:
+    ray_submit_time = time.time()
+    params["ray_submit_time"] = ray_submit_time
     return asyncio.run(runner(**params))
 
 
@@ -173,6 +175,7 @@ def model_post_init(self, __context: Any) -> None:
         print("Dependencies repositories set up complete", flush=True)
 
         self.config.run_session_id = f"{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}"
+        print(f"Run session ID: {self.config.run_session_id}", flush=True)
 
     async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()) -> NeMoGymResponse:
         # Extract problem information from request
@@ -189,6 +192,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             f"{problem_info.get('instance_id', 'unknown')}_{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}"
         )
         try:
+            ray_queue_time = time.time()
             params = {
                 "problem_info": problem_info,
                 "model_endpoint": model_endpoint,
@@ -207,6 +211,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir,
                 "dataset_path": self.config.dataset_path,
                 "instance_dir": instance_dir,
+                "ray_queue_time": ray_queue_time,
             }
 
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
index 5e29b3968..eb4a57583 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
@@ -9,7 +9,7 @@ swe_agents:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 7af10584eb623e6d50a616d3c3c967d7d4fb3690  # pragma: allowlist secret
+      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
       
       # Container configuration
       container_formatter: ???
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
index 7e6eacda0..d39fac5e1 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
@@ -8,7 +8,7 @@ swe_agents_train:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 7af10584eb623e6d50a616d3c3c967d7d4fb3690  # pragma: allowlist secret
+      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
       # Container configuration
       container_formatter: ???
       container_folder_path: null
@@ -39,7 +39,7 @@ swe_agents_val:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 7af10584eb623e6d50a616d3c3c967d7d4fb3690  # pragma: allowlist secret
+      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
       # Container configuration
       container_formatter: ???
       container_folder_path: null
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 16e30ec4f..1eb4e3313 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -353,6 +353,7 @@ async def _run_openhands(
                             "model_name_or_path": out_dict["metadata"]["llm_config"]["model"],
                             "instance_id": out_dict["instance_id"],
                             "model_patch": patch + "\n" if patch and not patch.endswith("\n") else patch,
+                            "oh_time_metrics": out_dict["metrics"],
                         }
                     )
                 )
@@ -1018,7 +1019,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
 
             output_dict = {
                 "swe-bench-metrics": report_json[data_point["instance_id"]],
-                "swe-bench-outputs": trajectory_dict,
+                "oh_time_metrics": trajectory_dict.get("oh_time_metrics", None) if trajectory_dict else {},
                 "generation": "",  # required TODO: we should fix this
                 "generation_time": generation_time,
                 "evaluation_time": evaluation_time,
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 376b4f153..e60df2ea9 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -639,6 +639,8 @@ async def run_swebench_evaluation(
     r2e_gym_setup_dir: Optional[Path] = None,
     dataset_path: Optional[str] = None,
     instance_dir: Optional[str] = None,
+    ray_queue_time: Optional[float] = None,
+    ray_submit_time: Optional[float] = None,
 ) -> Dict:
     # Create persistent directory for I/O and logs in local workspace
     workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
@@ -687,6 +689,8 @@ async def run_swebench_evaluation(
     result = await run_oh.process_single_datapoint(problem_info)
     print(f"Process completed for {instance_id}", flush=True)
 
+    result["oh_time_metrics"]["ray_time_in_queue"] = ray_submit_time - ray_queue_time
+
     try:
         with open(output_file, "w") as f:
             json.dump(result, f)
@@ -707,8 +711,6 @@ async def run_swebench_evaluation(
         agent_tools_file if agent_framework == "swe_agent" else None,
     )
 
-    # tools = convert_tools_to_function_format(tools) if tools else []
-
     result["tools"] = tools
     result["trajectory"] = trajectory_data
 

From 2dac87105bbd5593e9bccbd42578e541539e1519 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 22:12:07 -0800
Subject: [PATCH 023/127] try cpus 0.5; print num containers in parallel

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 34 +++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 1c0c9b046..87aed81ee 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -53,16 +53,40 @@
 )
 
 
+@ray.remote
+class ConcurrentContainerCounter:
+    def __init__(self):
+        self.concurrent_containers = 0
+
+    def increment(self):
+        self.counter += 1
+        return self.counter
+
+    def decrement(self):
+        self.counter += 1
+        return self.counter
+
+
 @ray.remote(
     scheduling_strategy="SPREAD",
     runtime_env={
         "py_executable": sys.executable,
     },
+    num_cpus=0.5,
 )
-def runner_ray_remote(runner: Callable, params: dict[str, Any]) -> Any:
+def runner_ray_remote(
+    concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any]
+) -> Any:
+    concurrent_containers = concurrent_container_counter.increment.remote()
+    print(f"Concurrent container #{concurrent_containers}", flush=True)
+
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
-    return asyncio.run(runner(**params))
+    result = asyncio.run(runner(**params))
+
+    concurrent_container_counter.decrement.remote()
+
+    return result
 
 
 class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig):
@@ -162,6 +186,7 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent):
 
     def model_post_init(self, __context: Any) -> None:
         self.sem = Semaphore(self.config.concurrency)
+        self.container_counter = ConcurrentContainerCounter.remote()
 
         # Pre-build OpenHands environment if using openhands framework
         if self.config.agent_framework == "openhands":
@@ -214,7 +239,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "ray_queue_time": ray_queue_time,
             }
 
-            future = runner_ray_remote.remote(run_swebench_evaluation, params)
+            future = runner_ray_remote.remote(self.container_counter, run_swebench_evaluation, params)
             result = await future
 
             # Extract trajectory and convert to proper NeMoGym format
@@ -307,6 +332,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""
         async with self.sem:
+            print(f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True)
+            body.responses_create_params.metadata["container_concurrency"] = self.config.concurrency - self.sem._value
+
             # Fix None values in responses_create_params to use defaults
             # This is needed because the pydantic model has non-Optional fields with defaults
 

From be3f4c284dd8ce00825f22b38e48ed1f937b41db Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 22:15:19 -0800
Subject: [PATCH 024/127] try add container counter param

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 87aed81ee..dbbd1f2c0 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -182,6 +182,7 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent):
 
     config: SWEBenchWrapperConfig
     sem: Semaphore = None
+    container_counter: ConcurrentContainerCounter = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def model_post_init(self, __context: Any) -> None:

From 98efc9f6fb489477bf8ae7cc5ccce26c52500899 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Wed, 21 Jan 2026 22:17:21 -0800
Subject: [PATCH 025/127] use private

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index dbbd1f2c0..d78323263 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -182,12 +182,12 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent):
 
     config: SWEBenchWrapperConfig
     sem: Semaphore = None
-    container_counter: ConcurrentContainerCounter = None
+    _container_counter: ConcurrentContainerCounter = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def model_post_init(self, __context: Any) -> None:
         self.sem = Semaphore(self.config.concurrency)
-        self.container_counter = ConcurrentContainerCounter.remote()
+        self._container_counter = ConcurrentContainerCounter.remote()
 
         # Pre-build OpenHands environment if using openhands framework
         if self.config.agent_framework == "openhands":
@@ -240,7 +240,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "ray_queue_time": ray_queue_time,
             }
 
-            future = runner_ray_remote.remote(self.container_counter, run_swebench_evaluation, params)
+            future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params)
             result = await future
 
             # Extract trajectory and convert to proper NeMoGym format

From 1709e9ed030fcd52dcf1bfa9228beafc03585230 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 09:03:44 -0800
Subject: [PATCH 026/127] ray get

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index d78323263..c273cd3d2 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -77,14 +77,14 @@ def decrement(self):
 def runner_ray_remote(
     concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any]
 ) -> Any:
-    concurrent_containers = concurrent_container_counter.increment.remote()
+    concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
     print(f"Concurrent container #{concurrent_containers}", flush=True)
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
     result = asyncio.run(runner(**params))
 
-    concurrent_container_counter.decrement.remote()
+    ray.get(concurrent_container_counter.decrement.remote())
 
     return result
 

From a4ed09d9beb451822218a1d2382a793586cc852d Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 09:14:23 -0800
Subject: [PATCH 027/127] fix

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index c273cd3d2..cf79738dc 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -59,12 +59,12 @@ def __init__(self):
         self.concurrent_containers = 0
 
     def increment(self):
-        self.counter += 1
-        return self.counter
+        self.concurrent_containers += 1
+        return self.concurrent_containers
 
     def decrement(self):
-        self.counter += 1
-        return self.counter
+        self.concurrent_containers -= 1
+        return self.concurrent_containers
 
 
 @ray.remote(

From f912c54bf0dfb6676805643368843dbe20c8b002 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 12:16:22 -0800
Subject: [PATCH 028/127] print usage

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_models/vllm_model/app.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index 46319303d..16d13edb1 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -251,6 +251,9 @@ async def chat_completions(
             else:
                 raise e
 
+        # TODO remove
+        print(chat_completion_dict["usage"])
+
         choice_dict = chat_completion_dict["choices"][0]
         if self.config.uses_reasoning_parser:
             reasoning_content = choice_dict["message"].get("reasoning_content")

From 5ffe4487adc95c36a1fdd5ac22822b9a4091bf2d Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 12:18:59 -0800
Subject: [PATCH 029/127] flush

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_models/vllm_model/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index 16d13edb1..30bb7a76f 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -252,7 +252,7 @@ async def chat_completions(
                 raise e
 
         # TODO remove
-        print(chat_completion_dict["usage"])
+        print(chat_completion_dict["usage"], flush=True)
 
         choice_dict = chat_completion_dict["choices"][0]
         if self.config.uses_reasoning_parser:

From f9450a20ab5e745659c870553758e9bd97e90b0c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 12:28:15 -0800
Subject: [PATCH 030/127] try disable

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/rollout_collection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_gym/rollout_collection.py b/nemo_gym/rollout_collection.py
index 8d2db4556..4c11e81f3 100644
--- a/nemo_gym/rollout_collection.py
+++ b/nemo_gym/rollout_collection.py
@@ -150,8 +150,9 @@ async def _post_subroutine(row: Dict) -> Tuple[Dict, Dict]:
             await raise_for_status(res)
             return row, await get_response_json(res)
 
+        # TODO revert disable=True
         return tqdm.as_completed(
-            map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples)
+            map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples), disable=True
         )
 
     def setup_server_client(self, head_server_config: Optional[BaseServerConfig] = None) -> ServerClient:

From 8575e3681e4ed75fd4aafb13eda4e0a233c7cda5 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 12:43:55 -0800
Subject: [PATCH 031/127] revert prefix server logs

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index 58ae1337f..99d1c21b9 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -576,7 +576,8 @@ def run_webserver(cls) -> FastAPI:  # pragma: no cover
 
         app = server.setup_webserver()
         server.set_ulimit()
-        server.prefix_server_logs()
+        # TODO remove
+        # server.prefix_server_logs()
         server.setup_exception_middleware(app)
 
         @app.exception_handler(RequestValidationError)

From 5f7079d3f9251e0ac100592cd1ff4d1189b4fd68 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:05:26 -0800
Subject: [PATCH 032/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index 99d1c21b9..58ae1337f 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -576,8 +576,7 @@ def run_webserver(cls) -> FastAPI:  # pragma: no cover
 
         app = server.setup_webserver()
         server.set_ulimit()
-        # TODO remove
-        # server.prefix_server_logs()
+        server.prefix_server_logs()
         server.setup_exception_middleware(app)
 
         @app.exception_handler(RequestValidationError)

From 5403fbc5c41b8bdc86f4369f06d98b0735164616 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:05:49 -0800
Subject: [PATCH 033/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/rollout_collection.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nemo_gym/rollout_collection.py b/nemo_gym/rollout_collection.py
index 4c11e81f3..8d2db4556 100644
--- a/nemo_gym/rollout_collection.py
+++ b/nemo_gym/rollout_collection.py
@@ -150,9 +150,8 @@ async def _post_subroutine(row: Dict) -> Tuple[Dict, Dict]:
             await raise_for_status(res)
             return row, await get_response_json(res)
 
-        # TODO revert disable=True
         return tqdm.as_completed(
-            map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples), disable=True
+            map(_post_subroutine, examples), desc="Collecting rollouts", miniters=10, total=len(examples)
         )
 
     def setup_server_client(self, head_server_config: Optional[BaseServerConfig] = None) -> ServerClient:

From 8fc2ebd6f9b62eb4e20f6bbd4683d9785584007c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:06:18 -0800
Subject: [PATCH 034/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_models/vllm_model/app.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index 30bb7a76f..46319303d 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -251,9 +251,6 @@ async def chat_completions(
             else:
                 raise e
 
-        # TODO remove
-        print(chat_completion_dict["usage"], flush=True)
-
         choice_dict = chat_completion_dict["choices"][0]
         if self.config.uses_reasoning_parser:
             reasoning_content = choice_dict["message"].get("reasoning_content")

From ba6153c4b215b4d3499fdedc12217654cbb5b860 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:10:50 -0800
Subject: [PATCH 035/127] try logger warning

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index cf79738dc..122a31b7c 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -78,7 +78,10 @@ def runner_ray_remote(
     concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any]
 ) -> Any:
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
-    print(f"Concurrent container #{concurrent_containers}", flush=True)
+
+    from logging import getLogger
+
+    getLogger().warning(f"Concurrent container #{concurrent_containers}")
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time

From f56c36f196620b8e49476312b785d0c54ac77fc5 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:12:31 -0800
Subject: [PATCH 036/127] print usage again

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_models/vllm_model/app.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index 46319303d..30bb7a76f 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -251,6 +251,9 @@ async def chat_completions(
             else:
                 raise e
 
+        # TODO remove
+        print(chat_completion_dict["usage"], flush=True)
+
         choice_dict = chat_completion_dict["choices"][0]
         if self.config.uses_reasoning_parser:
             reasoning_content = choice_dict["message"].get("reasoning_content")

From 49c7a9074895e6da48fe1acd2b48e7fd91d539ab Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:16:27 -0800
Subject: [PATCH 037/127] try info

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 122a31b7c..cd1da0074 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -81,7 +81,7 @@ def runner_ray_remote(
 
     from logging import getLogger
 
-    getLogger().warning(f"Concurrent container #{concurrent_containers}")
+    getLogger().info(f"Concurrent container #{concurrent_containers}")
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time

From d1bda2def99fcc0794c4e3fc3f437a6eff6a6ff5 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:25:56 -0800
Subject: [PATCH 038/127] try info into warning

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index cd1da0074..65a3bfc5d 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -81,7 +81,8 @@ def runner_ray_remote(
 
     from logging import getLogger
 
-    getLogger().info(f"Concurrent container #{concurrent_containers}")
+    getLogger().info(f"Concurrent container #{concurrent_containers} info")
+    getLogger().warning(f"Concurrent container #{concurrent_containers} warning")
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time

From dbd49ded9e418c9b63167cc3035beca7ec9fb905 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:37:20 -0800
Subject: [PATCH 039/127] try redirect stdout

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 65a3bfc5d..d1e34ca9b 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -409,4 +409,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
 
 if __name__ == "__main__":
-    SWEBenchWrapper.run_webserver()
+    import sys
+    from contextlib import redirect_stdout
+
+    with redirect_stdout(sys.stderr):
+        SWEBenchWrapper.run_webserver()

From 3025ad62929170d552a3891b55054f2e197ed284 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:38:47 -0800
Subject: [PATCH 040/127] redirect inside too

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index d1e34ca9b..cf8c458b7 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -79,9 +79,13 @@ def runner_ray_remote(
 ) -> Any:
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
 
+    import sys
+    from contextlib import redirect_stdout
     from logging import getLogger
 
-    getLogger().info(f"Concurrent container #{concurrent_containers} info")
+    with redirect_stdout(sys.stderr):
+        getLogger().info(f"Concurrent container #{concurrent_containers} info")
+
     getLogger().warning(f"Concurrent container #{concurrent_containers} warning")
 
     ray_submit_time = time.time()

From e82a6af1692a8394f565433ae9f466aabed547c0 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:40:50 -0800
Subject: [PATCH 041/127] set log level

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index cf8c458b7..2c73d808a 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -79,14 +79,13 @@ def runner_ray_remote(
 ) -> Any:
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
 
-    import sys
-    from contextlib import redirect_stdout
-    from logging import getLogger
-
-    with redirect_stdout(sys.stderr):
-        getLogger().info(f"Concurrent container #{concurrent_containers} info")
+    from logging import DEBUG, getLogger
 
-    getLogger().warning(f"Concurrent container #{concurrent_containers} warning")
+    # with redirect_stdout(sys.stderr):
+    logger = getLogger()
+    logger.setLevel(DEBUG)
+    logger.info(f"Concurrent container #{concurrent_containers} info")
+    logger.warning(f"Concurrent container #{concurrent_containers} warning")
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time

From ee0af2f64993e2564ae3911d3e30904070fffd33 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:42:09 -0800
Subject: [PATCH 042/127] try redirect

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 2c73d808a..7152bc4a8 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -79,12 +79,14 @@ def runner_ray_remote(
 ) -> Any:
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
 
+    import sys
+    from contextlib import redirect_stdout
     from logging import DEBUG, getLogger
 
-    # with redirect_stdout(sys.stderr):
     logger = getLogger()
     logger.setLevel(DEBUG)
-    logger.info(f"Concurrent container #{concurrent_containers} info")
+    with redirect_stdout(sys.stderr):
+        logger.info(f"Concurrent container #{concurrent_containers} info")
     logger.warning(f"Concurrent container #{concurrent_containers} warning")
 
     ray_submit_time = time.time()

From 843b0cb0984c9d7efec48dcd6e0cef1373092f7e Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:46:29 -0800
Subject: [PATCH 043/127] try print with file

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 7152bc4a8..fdc060bb6 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -80,14 +80,11 @@ def runner_ray_remote(
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
 
     import sys
-    from contextlib import redirect_stdout
-    from logging import DEBUG, getLogger
+    from logging import getLogger
 
-    logger = getLogger()
-    logger.setLevel(DEBUG)
-    with redirect_stdout(sys.stderr):
-        logger.info(f"Concurrent container #{concurrent_containers} info")
-    logger.warning(f"Concurrent container #{concurrent_containers} warning")
+    print(f"Concurrent container #{concurrent_containers} print file sys.stderr", file=sys.stderr)
+    print(f"Concurrent container #{concurrent_containers} print file default")
+    getLogger().warning(f"Concurrent container #{concurrent_containers} warning")
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
@@ -414,8 +411,4 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
 
 if __name__ == "__main__":
-    import sys
-    from contextlib import redirect_stdout
-
-    with redirect_stdout(sys.stderr):
-        SWEBenchWrapper.run_webserver()
+    SWEBenchWrapper.run_webserver()

From 7ef8b0791c416d2eaa13acb2e663a2149a418463 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:47:43 -0800
Subject: [PATCH 044/127] try print again

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index fdc060bb6..0a8d99f03 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -80,10 +80,13 @@ def runner_ray_remote(
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
 
     import sys
+    from contextlib import redirect_stdout
     from logging import getLogger
 
     print(f"Concurrent container #{concurrent_containers} print file sys.stderr", file=sys.stderr)
     print(f"Concurrent container #{concurrent_containers} print file default")
+    with redirect_stdout(sys.stderr):
+        print(f"Concurrent container #{concurrent_containers} print file redirect stdout to stderr")
     getLogger().warning(f"Concurrent container #{concurrent_containers} warning")
 
     ray_submit_time = time.time()

From eb9baffcd317ad95d5310a4b1c865fbd20176ce1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 13:56:21 -0800
Subject: [PATCH 045/127] wrap entire call

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 0a8d99f03..cd9443378 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -17,6 +17,7 @@
 import time
 import uuid
 from asyncio import Semaphore
+from contextlib import redirect_stdout
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional
 
@@ -79,19 +80,12 @@ def runner_ray_remote(
 ) -> Any:
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
 
-    import sys
-    from contextlib import redirect_stdout
-    from logging import getLogger
-
-    print(f"Concurrent container #{concurrent_containers} print file sys.stderr", file=sys.stderr)
-    print(f"Concurrent container #{concurrent_containers} print file default")
-    with redirect_stdout(sys.stderr):
-        print(f"Concurrent container #{concurrent_containers} print file redirect stdout to stderr")
-    getLogger().warning(f"Concurrent container #{concurrent_containers} warning")
-
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
-    result = asyncio.run(runner(**params))
+
+    with redirect_stdout(sys.stderr):
+        print(f"Concurrent container #{concurrent_containers}")
+        result = asyncio.run(runner(**params))
 
     ray.get(concurrent_container_counter.decrement.remote())
 

From 516b66fcb78aae49ebea26c6f18073490d6081ed Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 14:00:25 -0800
Subject: [PATCH 046/127] just use std err

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index cd9443378..8338f628f 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -17,7 +17,6 @@
 import time
 import uuid
 from asyncio import Semaphore
-from contextlib import redirect_stdout
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional
 
@@ -79,13 +78,12 @@ def runner_ray_remote(
     concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any]
 ) -> Any:
     concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
+    print(f"Concurrent container #{concurrent_containers}", file=sys.stderr)
 
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
 
-    with redirect_stdout(sys.stderr):
-        print(f"Concurrent container #{concurrent_containers}")
-        result = asyncio.run(runner(**params))
+    result = asyncio.run(runner(**params))
 
     ray.get(concurrent_container_counter.decrement.remote())
 

From 5a0c1a2953937669b51ff813945d6704c9dd48f1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 16:31:08 -0800
Subject: [PATCH 047/127] try impl dump graph

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 17 +++++++++++++++--
 pyproject.toml           |  3 +++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index 58ae1337f..e942cb489 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -48,8 +48,10 @@
 from fastapi.exception_handlers import request_validation_exception_handler
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
+from gprof2dot import main as gprof2dot_main
 from omegaconf import DictConfig, OmegaConf, open_dict
 from pydantic import BaseModel, ConfigDict
+from pydot import graph_from_dot_file
 from requests.exceptions import ConnectionError
 from starlette.middleware.sessions import SessionMiddleware
 
@@ -452,14 +454,25 @@ async def exception_handling_middleware(request: Request, call_next):
                 return JSONResponse(content="An unknown error occurred", status_code=500)
 
     def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None:  # pragma: no cover
-        base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath
-        server_profile_path = (base_profile_dir / self.get_session_middleware_key()).with_suffix(".log")
+        base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath / self.get_session_middleware_key()
+        server_profile_path = base_profile_dir / "yappi.log"
+        callgrind_path = base_profile_dir / "yappi.callgrind"
+        callgrind_dotfile_path = base_profile_dir / "yappi.dot"
+        callgrind_graph_path = base_profile_dir / "yappi.png"
 
         base_profile_dir.mkdir(parents=True, exist_ok=True)
 
         main_app_lifespan = app.router.lifespan_context
 
         def _dump_yappi_stats() -> str:
+            yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
+            gprof2dot_main(
+                argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()
+            )
+
+            (graph,) = graph_from_dot_file(callgrind_dotfile_path)
+            graph.write_png(callgrind_graph_path)
+
             buffer = StringIO()
             yappi.get_func_stats().print_all(
                 out=buffer,
diff --git a/pyproject.toml b/pyproject.toml
index 522d4645a..038c338e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -158,6 +158,9 @@ dependencies = [
     # Updated: Thu Jan 08, 2026 with orjson==3.11.3
     # License: Apache 2.0 https://github.com/ijl/orjson/blob/fb3eb1f729c7e7b019f780af5695722c99c7c695/LICENSE-APACHE
     "orjson",
+
+    "gprof2dot",
+    "pydot",
 ]
 
 [dependency-groups]

From c029cb6fffa531c911a5c9d4a0b295c5651fba59 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 16:57:44 -0800
Subject: [PATCH 048/127] add print

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index e942cb489..70b577b7d 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -603,6 +603,7 @@ async def validation_exception_handler(request: Request, exc):
 
         profiling_config = ProfilingMiddlewareConfig.model_validate(global_config_dict)
         if profiling_config.profiling_enabled:
+            print(f"Enabled profiling for {server.config.name}")
             server.setup_profiling(app, profiling_config)
 
         uvicorn_logging_cfg = UvicornLoggingConfig.model_validate(global_config_dict)

From d1f72de0005b1aa63b844ccfadd0a4f99c0dcdd6 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 17:16:55 -0800
Subject: [PATCH 049/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index 70b577b7d..e942cb489 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -603,7 +603,6 @@ async def validation_exception_handler(request: Request, exc):
 
         profiling_config = ProfilingMiddlewareConfig.model_validate(global_config_dict)
         if profiling_config.profiling_enabled:
-            print(f"Enabled profiling for {server.config.name}")
             server.setup_profiling(app, profiling_config)
 
         uvicorn_logging_cfg = UvicornLoggingConfig.model_validate(global_config_dict)

From cc67ad3418b93a324c45f6efe766d4fb11986901 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 17:19:05 -0800
Subject: [PATCH 050/127] add prints

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index e942cb489..aff9eb36b 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -465,12 +465,17 @@ def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareCon
         main_app_lifespan = app.router.lifespan_context
 
         def _dump_yappi_stats() -> str:
+            # TODO remove
+            print("yappi get func stats", file=sys.stderr)
             yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
+            print("gprof2dot_main", file=sys.stderr)
             gprof2dot_main(
                 argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()
             )
 
+            print("graph_from_dot_file", file=sys.stderr)
             (graph,) = graph_from_dot_file(callgrind_dotfile_path)
+            print("graph.write_png", file=sys.stderr)
             graph.write_png(callgrind_graph_path)
 
             buffer = StringIO()

From b085a140929d3482c6ec3d6eb69b1be2853456bc Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 17:31:11 -0800
Subject: [PATCH 051/127] try timeout and kill

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/cli.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/nemo_gym/cli.py b/nemo_gym/cli.py
index af3558253..ec0f9c8a6 100644
--- a/nemo_gym/cli.py
+++ b/nemo_gym/cli.py
@@ -25,7 +25,7 @@
 from os.path import exists
 from pathlib import Path
 from signal import SIGINT
-from subprocess import Popen
+from subprocess import Popen, TimeoutExpired
 from threading import Thread
 from time import sleep, time
 from typing import Dict, List, Optional, Tuple
@@ -343,8 +343,12 @@ def shutdown(self) -> None:
             process.send_signal(SIGINT)
 
         print("Waiting for processes to finish...")
-        for process in self._processes.values():
-            process.wait()
+        for top_level_path, process in self._processes.items():
+            try:
+                process.wait(timeout=5)
+            except TimeoutExpired:
+                print(f"Waiting for process {top_level_path} timed out. Killing process instead.")
+                process.kill()
 
         self._processes = dict()
 

From ea35354c231ad27bc62a6dde2053d4edb0682f31 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 17:35:04 -0800
Subject: [PATCH 052/127] revert

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/cli.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/nemo_gym/cli.py b/nemo_gym/cli.py
index ec0f9c8a6..af3558253 100644
--- a/nemo_gym/cli.py
+++ b/nemo_gym/cli.py
@@ -25,7 +25,7 @@
 from os.path import exists
 from pathlib import Path
 from signal import SIGINT
-from subprocess import Popen, TimeoutExpired
+from subprocess import Popen
 from threading import Thread
 from time import sleep, time
 from typing import Dict, List, Optional, Tuple
@@ -343,12 +343,8 @@ def shutdown(self) -> None:
             process.send_signal(SIGINT)
 
         print("Waiting for processes to finish...")
-        for top_level_path, process in self._processes.items():
-            try:
-                process.wait(timeout=5)
-            except TimeoutExpired:
-                print(f"Waiting for process {top_level_path} timed out. Killing process instead.")
-                process.kill()
+        for process in self._processes.values():
+            process.wait()
 
         self._processes = dict()
 

From 946b8dc5da6c7f6bc71900c6eed9bbbefc684b77 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 17:46:35 -0800
Subject: [PATCH 053/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/server_utils.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index aff9eb36b..e942cb489 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -465,17 +465,12 @@ def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareCon
         main_app_lifespan = app.router.lifespan_context
 
         def _dump_yappi_stats() -> str:
-            # TODO remove
-            print("yappi get func stats", file=sys.stderr)
             yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
-            print("gprof2dot_main", file=sys.stderr)
             gprof2dot_main(
                 argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()
             )
 
-            print("graph_from_dot_file", file=sys.stderr)
             (graph,) = graph_from_dot_file(callgrind_dotfile_path)
-            print("graph.write_png", file=sys.stderr)
             graph.write_png(callgrind_graph_path)
 
             buffer = StringIO()

From 1e2dad11e04eb7abcfcaa05f5231f5ca3cbda2ce Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 20:33:48 -0800
Subject: [PATCH 054/127] simplify get global config dict

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index fbaa478f9..bef135502 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -23,6 +23,7 @@
 
 from openai.types.responses.function_tool import FunctionTool
 
+from nemo_gym.global_config import get_global_config_dict
 from nemo_gym.openai_utils import (
     NeMoGymEasyInputMessage,
     NeMoGymFunctionCallOutput,
@@ -33,7 +34,7 @@
     NeMoGymResponseOutputMessageForTraining,
     NeMoGymResponseOutputText,
 )
-from nemo_gym.server_utils import ServerClient, get_first_server_config_dict
+from nemo_gym.server_utils import get_first_server_config_dict
 from responses_api_agents.swe_agents.run_openhands import (
     RunOpenHandsAgent,
     SupportedAgentFrameworks,
@@ -610,7 +611,7 @@ def extract_problem_info(
 
 
 def get_model_endpoint(model_server_name: str) -> str:
-    global_config_dict = ServerClient.load_from_global_config().global_config_dict
+    global_config_dict = get_global_config_dict()
 
     model_server_config = get_first_server_config_dict(
         global_config_dict,

From 826310f5e1f6a3b5eaf0f6698137acb95dc0016f Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 20:35:16 -0800
Subject: [PATCH 055/127] try fix serialization error

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index bef135502..537824dca 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -399,7 +399,7 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List:
                 parameters=func_def.get("parameters"),
                 strict=func_def.get("strict"),  # May be None
             )
-            tools.append(function_tool)
+            tools.append(function_tool.model_dump())
     return tools
 
 

From f893f096425637133db5861de85cafeab091e0e3 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 20:51:31 -0800
Subject: [PATCH 056/127] try refactor into profiler

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/profiling.py    | 60 ++++++++++++++++++++++++++++++++++++++++
 nemo_gym/server_utils.py | 58 ++++----------------------------------
 2 files changed, 66 insertions(+), 52 deletions(-)
 create mode 100644 nemo_gym/profiling.py

diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py
new file mode 100644
index 000000000..91b6cb9b3
--- /dev/null
+++ b/nemo_gym/profiling.py
@@ -0,0 +1,60 @@
+from io import StringIO
+from pathlib import Path
+
+import yappi
+from gprof2dot import main as gprof2dot_main
+from pydantic import BaseModel
+from pydot import graph_from_dot_file
+
+
+class Profiler(BaseModel):
+    name: str
+    base_profile_dir: Path
+
+    def start(self) -> None:
+        yappi.set_clock_type("CPU")
+        yappi.start()
+        print(f"🔍 Enabled profiling for {self.name}")
+
+    def stop(self) -> None:
+        print(f"🛑 Stopping profiler for {self.name}. Check {self.base_profile_dir} for the metrics!")
+        yappi.stop()
+        self.dump()
+
+    def dump(self) -> None:
+        self.base_profile_dir.mkdir(parents=True, exist_ok=True)
+        log_path = self.base_profile_dir / "yappi.log"
+        callgrind_path = self.base_profile_dir / "yappi.callgrind"
+        callgrind_dotfile_path = self.base_profile_dir / "yappi.dot"
+        callgrind_graph_path = self.base_profile_dir / "yappi.png"
+
+        yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
+        gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split())
+
+        (graph,) = graph_from_dot_file(callgrind_dotfile_path)
+        graph.write_png(callgrind_graph_path)
+
+        buffer = StringIO()
+        yappi.get_func_stats().print_all(
+            out=buffer,
+            columns={
+                0: ("name", 200),
+                1: ("ncall", 10),
+                2: ("tsub", 8),
+                3: ("ttot", 8),
+                4: ("tavg", 8),
+            },
+        )
+
+        buffer.seek(0)
+        res = ""
+        past_header = False
+        for line in buffer:
+            if not past_header or self.config.entrypoint in line:
+                res += line
+
+            if line.startswith("name"):
+                past_header = True
+
+        with open(log_path, "w") as f:
+            f.write(res)
diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py
index e942cb489..a687376dd 100644
--- a/nemo_gym/server_utils.py
+++ b/nemo_gym/server_utils.py
@@ -19,7 +19,6 @@
 import sys
 from abc import abstractmethod
 from contextlib import asynccontextmanager
-from io import StringIO
 from logging import Filter as LoggingFilter
 from logging import LogRecord, getLogger
 from os import environ, getenv
@@ -33,7 +32,6 @@
 import ray
 import requests
 import uvicorn
-import yappi
 from aiohttp import (
     ClientResponse,
     ClientResponseError,
@@ -48,10 +46,8 @@
 from fastapi.exception_handlers import request_validation_exception_handler
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
-from gprof2dot import main as gprof2dot_main
 from omegaconf import DictConfig, OmegaConf, open_dict
 from pydantic import BaseModel, ConfigDict
-from pydot import graph_from_dot_file
 from requests.exceptions import ConnectionError
 from starlette.middleware.sessions import SessionMiddleware
 
@@ -69,6 +65,7 @@
     get_first_server_config_dict,
     get_global_config_dict,
 )
+from nemo_gym.profiling import Profiler
 
 
 _GLOBAL_AIOHTTP_CLIENT: Union[None, ClientSession] = None
@@ -455,68 +452,25 @@ async def exception_handling_middleware(request: Request, call_next):
 
     def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None:  # pragma: no cover
         base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath / self.get_session_middleware_key()
-        server_profile_path = base_profile_dir / "yappi.log"
-        callgrind_path = base_profile_dir / "yappi.callgrind"
-        callgrind_dotfile_path = base_profile_dir / "yappi.dot"
-        callgrind_graph_path = base_profile_dir / "yappi.png"
-
-        base_profile_dir.mkdir(parents=True, exist_ok=True)
+        profiler = Profiler(name=self.config.name, base_profile_dir=base_profile_dir)
 
         main_app_lifespan = app.router.lifespan_context
 
-        def _dump_yappi_stats() -> str:
-            yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
-            gprof2dot_main(
-                argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()
-            )
-
-            (graph,) = graph_from_dot_file(callgrind_dotfile_path)
-            graph.write_png(callgrind_graph_path)
-
-            buffer = StringIO()
-            yappi.get_func_stats().print_all(
-                out=buffer,
-                columns={
-                    0: ("name", 200),
-                    1: ("ncall", 10),
-                    2: ("tsub", 8),
-                    3: ("ttot", 8),
-                    4: ("tavg", 8),
-                },
-            )
-
-            buffer.seek(0)
-            res = ""
-            past_header = False
-            for line in buffer:
-                if not past_header or self.config.entrypoint in line:
-                    res += line
-
-                if line.startswith("name"):
-                    past_header = True
-
-            return res
-
         @asynccontextmanager
         async def lifespan_wrapper(app):
-            yappi.set_clock_type("CPU")
-            yappi.start()
-            print(f"🔍 Enabled profiling for {self.config.name}")
+            profiler.start()
 
             async with main_app_lifespan(app) as maybe_state:
                 yield maybe_state
 
-            print(f"🛑 Stopping profiler for {self.config.name}. Check {server_profile_path} for the metrics!")
-            yappi.stop()
-
-            with open(server_profile_path, "w") as f:
-                f.write(_dump_yappi_stats())
+            profiler.stop()
 
         app.router.lifespan_context = lifespan_wrapper
 
         @app.get("/stats")
         def stats():
-            return Response(_dump_yappi_stats())
+            profiler.dump()
+            return Response()
 
     def set_ulimit(self, target_soft_limit: int = 65535):  # pragma: no cover
         # From https://github.com/vllm-project/vllm/blob/fed8a9b107df3e27d57728c6911c7d308b871477/vllm/utils/__init__.py#L2790

From e504716855a55762dda6a747012adced5b16a725 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 20:58:17 -0800
Subject: [PATCH 057/127] fix

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/profiling.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py
index 91b6cb9b3..319e0d11a 100644
--- a/nemo_gym/profiling.py
+++ b/nemo_gym/profiling.py
@@ -1,5 +1,6 @@
 from io import StringIO
 from pathlib import Path
+from typing import Optional
 
 import yappi
 from gprof2dot import main as gprof2dot_main
@@ -11,6 +12,9 @@ class Profiler(BaseModel):
     name: str
     base_profile_dir: Path
 
+    # Used to clean up and filter out unnecessary information in the yappi log
+    required_str: Optional[str] = None
+
     def start(self) -> None:
         yappi.set_clock_type("CPU")
         yappi.start()
@@ -50,7 +54,7 @@ def dump(self) -> None:
         res = ""
         past_header = False
         for line in buffer:
-            if not past_header or self.config.entrypoint in line:
+            if not past_header or (self.required_str and self.required_str in line):
                 res += line
 
             if line.startswith("name"):

From b7f27a9da187acfb7a732a6facb2d8984ea3f625 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Thu, 22 Jan 2026 21:27:33 -0800
Subject: [PATCH 058/127] try add profiling to instance

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py   | 17 ++++++++++++++---
 responses_api_agents/swe_agents/utils.py |  7 +------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 8338f628f..ce2eb61a2 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import asyncio
 import json
+import os
 import sys
 import time
 import uuid
@@ -40,6 +41,7 @@
     NeMoGymResponseOutputMessage,
     NeMoGymResponseOutputText,
 )
+from nemo_gym.profiling import Profiler
 from responses_api_agents.swe_agents.utils import (
     convert_tools_to_function_format,
     convert_trajectory_to_output_items,
@@ -83,8 +85,14 @@ def runner_ray_remote(
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
 
+    instance_id = params["problem_info"].get("instance_id", "unknown")
+    profiler = Profiler(name=instance_id, base_profile_dir=params["persistent_dir"] / "profiling")
+    profiler.start()
+
     result = asyncio.run(runner(**params))
 
+    profiler.stop()
+
     ray.get(concurrent_container_counter.decrement.remote())
 
     return result
@@ -214,33 +222,36 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
         # Get model endpoint
         model_endpoint = get_model_endpoint(self.config.model_server.name)
 
-        # Run SWE-bench evaluation
+        # Create persistent directory for I/O and logs in local workspace
         instance_dir = (
             f"{problem_info.get('instance_id', 'unknown')}_{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}"
         )
+        workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
+        persistent_dir = workspace_root / f"swebench_results_{self.config.run_session_id}" / instance_dir
+        persistent_dir.mkdir(parents=True, exist_ok=True)
         try:
             ray_queue_time = time.time()
             params = {
                 "problem_info": problem_info,
                 "model_endpoint": model_endpoint,
                 "body": body,
-                "run_session_id": self.config.run_session_id,
                 "agent_framework": self.config.agent_framework,
                 "agent_config": self.config.agent_config,
                 "agent_tools_file": self.config.agent_tools_file,
                 "agent_max_turns": self.config.agent_max_turns,
                 "swebench_tests_timeout": self.config.swebench_tests_timeout,
                 "swebench_agent_timeout": self.config.swebench_agent_timeout,
+                "persistent_dir": persistent_dir,
                 "agent_framework_repo": self.config.agent_framework_repo,
                 "agent_framework_commit": self.config.agent_framework_commit,
                 "openhands_setup_dir": self.config.openhands_setup_dir,
                 "swebench_setup_dir": self.config.swebench_setup_dir,
                 "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir,
                 "dataset_path": self.config.dataset_path,
-                "instance_dir": instance_dir,
                 "ray_queue_time": ray_queue_time,
             }
 
+            # Run SWE-bench evaluation
             future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params)
             result = await future
 
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 537824dca..42a24cda9 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -626,28 +626,23 @@ async def run_swebench_evaluation(
     problem_info: Dict,
     model_endpoint: str,
     body: NeMoGymResponseCreateParamsNonStreaming,
-    run_session_id: str,
     agent_framework: str,
     agent_config: Optional[str],
     agent_tools_file: Optional[str],
     agent_max_turns: int,
     swebench_tests_timeout: int,
     swebench_agent_timeout: int,
+    persistent_dir: str,
     agent_framework_repo: Optional[str] = None,
     agent_framework_commit: str = "HEAD",
     openhands_setup_dir: Optional[Path] = None,
     swebench_setup_dir: Optional[Path] = None,
     r2e_gym_setup_dir: Optional[Path] = None,
     dataset_path: Optional[str] = None,
-    instance_dir: Optional[str] = None,
     ray_queue_time: Optional[float] = None,
     ray_submit_time: Optional[float] = None,
 ) -> Dict:
-    # Create persistent directory for I/O and logs in local workspace
-    workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
     instance_id = problem_info.get("instance_id", "unknown")
-    persistent_dir = workspace_root / f"swebench_results_{run_session_id}" / instance_dir
-    persistent_dir.mkdir(parents=True, exist_ok=True)
     output_file = persistent_dir / "output.jsonl"
 
     inference_params = {}

From 744567f5140cb40f46249d5e04fd7df86844ab64 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 10:33:57 -0800
Subject: [PATCH 059/127] pip ng profiling dir

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 1eb4e3313..bee5b7933 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -181,6 +181,7 @@ async def _run_openhands(
         data_point: dict[str, Any],
         api_base: str,
         agent_run_id: str,
+        profiling_dir: str,
         dataset_mount_path: Optional[str] = None,
     ):
         """
@@ -262,6 +263,7 @@ async def _run_openhands(
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
             "export POETRY_VIRTUALENVS_IN_PROJECT=true && "
+            f"export NG_PROFILING_DIR={profiling_dir} && "
             "export POETRY_VIRTUALENVS_CREATE=false && "
             "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && "
             # TODO (sugam): fix cryptography issue

From 1cbf06335299cfa9e1a840d58b6f4b724d134732 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 10:39:51 -0800
Subject: [PATCH 060/127] pipe

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 3 ++-
 responses_api_agents/swe_agents/utils.py         | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index bee5b7933..4c8eb6256 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -911,7 +911,7 @@ def check_tests_passed(
 
         return required_tests <= passed_tests
 
-    async def process_single_datapoint(self, data_point: dict[str, Any]):
+    async def process_single_datapoint(self, data_point: dict[str, Any], persistent_dir: Path):
         self.output_dir = Path(self.cfg.output_file).parent
 
         agent_run_id = f"{data_point['instance_id']}_{int(time.time())}_{str(uuid.uuid4())[:8]}"
@@ -935,6 +935,7 @@ async def process_single_datapoint(self, data_point: dict[str, Any]):
                     api_base,
                     agent_run_id,
                     instance_dataset_path,
+                    persistent_dir / "profiling",
                 )
             else:
                 raise ValueError(
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 42a24cda9..cc6806791 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -632,7 +632,7 @@ async def run_swebench_evaluation(
     agent_max_turns: int,
     swebench_tests_timeout: int,
     swebench_agent_timeout: int,
-    persistent_dir: str,
+    persistent_dir: Path,
     agent_framework_repo: Optional[str] = None,
     agent_framework_commit: str = "HEAD",
     openhands_setup_dir: Optional[Path] = None,
@@ -683,7 +683,7 @@ async def run_swebench_evaluation(
         dataset_path=dataset_path,
     )
 
-    result = await run_oh.process_single_datapoint(problem_info)
+    result = await run_oh.process_single_datapoint(problem_info, persistent_dir)
     print(f"Process completed for {instance_id}", flush=True)
 
     result["oh_time_metrics"]["ray_time_in_queue"] = ray_submit_time - ray_queue_time

From 6712ab2dc3dba2e438b2993c21694de36e476fea Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 10:43:25 -0800
Subject: [PATCH 061/127] use name

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/profiling.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py
index 319e0d11a..dcdc61e51 100644
--- a/nemo_gym/profiling.py
+++ b/nemo_gym/profiling.py
@@ -27,10 +27,10 @@ def stop(self) -> None:
 
     def dump(self) -> None:
         self.base_profile_dir.mkdir(parents=True, exist_ok=True)
-        log_path = self.base_profile_dir / "yappi.log"
-        callgrind_path = self.base_profile_dir / "yappi.callgrind"
-        callgrind_dotfile_path = self.base_profile_dir / "yappi.dot"
-        callgrind_graph_path = self.base_profile_dir / "yappi.png"
+        log_path = self.base_profile_dir / f"{self.name}.log"
+        callgrind_path = self.base_profile_dir / f"{self.name}.callgrind"
+        callgrind_dotfile_path = self.base_profile_dir / f"{self.name}.dot"
+        callgrind_graph_path = self.base_profile_dir / f"{self.name}.png"
 
         yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
         gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split())

From 17a2ce94596b6c2910dc47879ba08a79f9601bf1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 10:48:45 -0800
Subject: [PATCH 062/127] try switch commits

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/configs/swebench_openhands.yaml                | 2 +-
 .../swe_agents/configs/swebench_openhands_training.yaml       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
index eb4a57583..1e9680950 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
@@ -9,7 +9,7 @@ swe_agents:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
+      agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce
       
       # Container configuration
       container_formatter: ???
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
index d39fac5e1..17898aa51 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
@@ -8,7 +8,7 @@ swe_agents_train:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
+      agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce
       # Container configuration
       container_formatter: ???
       container_folder_path: null
@@ -39,7 +39,7 @@ swe_agents_val:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
+      agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce
       # Container configuration
       container_formatter: ???
       container_folder_path: null

From 24516f1097a311e51a4bfe31f06630987355e15c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 11:16:48 -0800
Subject: [PATCH 063/127] bump openhands

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/configs/swebench_openhands.yaml                | 2 +-
 .../swe_agents/configs/swebench_openhands_training.yaml       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
index 1e9680950..3aa72d780 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
@@ -9,7 +9,7 @@ swe_agents:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce
+      agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151
       
       # Container configuration
       container_formatter: ???
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
index 17898aa51..b30304795 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
@@ -8,7 +8,7 @@ swe_agents_train:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce
+      agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151
       # Container configuration
       container_formatter: ???
       container_folder_path: null
@@ -39,7 +39,7 @@ swe_agents_val:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: a7fa35e4ed4dc33d87dc05f049c925252b71bbce
+      agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151
       # Container configuration
       container_formatter: ???
       container_folder_path: null

From d46d4404e1e4c9b4b1459f1ffb16912d3248d739 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 11:37:31 -0800
Subject: [PATCH 064/127] convert to list

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/run_openhands.py               | 36 +++++++------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 4c8eb6256..ebc0eea82 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -521,30 +521,20 @@ async def _execute_container_command(
             mount_args.append(f"--mount type=bind,src={venv_path},dst=/openhands_setup/OpenHands/.venv,ro")
             mount_args.append(f"--mount type=bind,src={venv_path},dst={venv_path},ro")
 
-            # make everything in OpenHands read-only
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands,dst=/openhands_setup/OpenHands,ro"
+            mount_args.extend(
+                [
+                    # make everything in OpenHands read-only
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands,dst=/openhands_setup/OpenHands,ro",
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst=/openhands_setup/OpenHands/.eval_sessions",
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst={self.openhands_setup_dir}/OpenHands/.eval_sessions",
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst=/openhands_setup/OpenHands/logs",
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst={self.openhands_setup_dir}/OpenHands/logs",
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst=/openhands_setup/OpenHands/evaluation/oh",
+                    f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst={self.openhands_setup_dir}/OpenHands/evaluation/oh",
+                    # Data
+                    f"--mount type=bind,src={dataset_path_to_mount},dst=/root/dataset/data.jsonl",
+                ]
             )
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst=/openhands_setup/OpenHands/.eval_sessions"
-            )
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/.eval_sessions,dst={self.openhands_setup_dir}/OpenHands/.eval_sessions"
-            )
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst=/openhands_setup/OpenHands/logs"
-            )
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/logs,dst={self.openhands_setup_dir}/OpenHands/logs"
-            )
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst=/openhands_setup/OpenHands/evaluation/oh"
-            )
-            mount_args.append(
-                f"--mount type=bind,src={self.openhands_setup_dir}/OpenHands/evaluation/oh,dst={self.openhands_setup_dir}/OpenHands/evaluation/oh"
-            )
-
-            mount_args.append(f"--mount type=bind,src={dataset_path_to_mount},dst=/root/dataset/data.jsonl")
 
             miniforge3_path = Path(self.openhands_setup_dir) / "miniforge3"
             mount_args.append(f"--mount type=bind,src={miniforge3_path},dst=/openhands_setup/miniforge3,ro")

From 53c009c7d26608062a3852037a1169ba5d3630cc Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 11:44:29 -0800
Subject: [PATCH 065/127] try mount profiling dir

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index ebc0eea82..fe3495548 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -337,6 +337,7 @@ async def _run_openhands(
                 max_retries=1,
                 timeout=self.cfg.swebench_agent_timeout + 60,
                 dataset_mount_path=dataset_mount_path,
+                profiling_dir=profiling_dir,
             )
 
             with open(out_file, "r") as f:
@@ -480,6 +481,7 @@ async def _execute_container_command(
         max_retries: int = 2,
         timeout: int = 45 * 60,  # 45 minutes
         dataset_mount_path: Optional[str] = None,
+        profiling_dir: Optional[str] = None,
     ):
         """Execute a command in an Apptainer container with retry logic."""
         # Find the container using multiple strategies
@@ -536,6 +538,11 @@ async def _execute_container_command(
                 ]
             )
 
+            if profiling_dir:
+                mount_args.append(
+                    f"--mount type=bind,src={profiling_dir},dst={profiling_dir}",
+                )
+
             miniforge3_path = Path(self.openhands_setup_dir) / "miniforge3"
             mount_args.append(f"--mount type=bind,src={miniforge3_path},dst=/openhands_setup/miniforge3,ro")
             mount_args.append(f"--mount type=bind,src={miniforge3_path},dst={miniforge3_path},ro")

From 9e9185ed5d701a1e03354aed9de0ef900fffa1f7 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 12:15:56 -0800
Subject: [PATCH 066/127] print mount args

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index fe3495548..dd0f1d4e0 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -590,6 +590,11 @@ async def _execute_container_command(
         container_commands.append(command)
         combined_command = " && ".join(container_commands)
 
+        # TODO remove
+        import sys
+
+        print("\n".join(mount_args), file=sys.stderr)
+
         mount_str = " ".join(mount_args)
 
         # Launch Apptainer container and execute the command

From af13f0443e1e6a6ea8f7455fd0085660603255b9 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 12:19:33 -0800
Subject: [PATCH 067/127] try reuse trajectories dir

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index dd0f1d4e0..8dd46adc8 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -181,7 +181,6 @@ async def _run_openhands(
         data_point: dict[str, Any],
         api_base: str,
         agent_run_id: str,
-        profiling_dir: str,
         dataset_mount_path: Optional[str] = None,
     ):
         """
@@ -263,7 +262,7 @@ async def _run_openhands(
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
             "export POETRY_VIRTUALENVS_IN_PROJECT=true && "
-            f"export NG_PROFILING_DIR={profiling_dir} && "
+            f"export NG_PROFILING_DIR=/trajectories_mount/profiling && "
             "export POETRY_VIRTUALENVS_CREATE=false && "
             "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && "
             # TODO (sugam): fix cryptography issue
@@ -337,7 +336,6 @@ async def _run_openhands(
                 max_retries=1,
                 timeout=self.cfg.swebench_agent_timeout + 60,
                 dataset_mount_path=dataset_mount_path,
-                profiling_dir=profiling_dir,
             )
 
             with open(out_file, "r") as f:
@@ -481,7 +479,6 @@ async def _execute_container_command(
         max_retries: int = 2,
         timeout: int = 45 * 60,  # 45 minutes
         dataset_mount_path: Optional[str] = None,
-        profiling_dir: Optional[str] = None,
     ):
         """Execute a command in an Apptainer container with retry logic."""
         # Find the container using multiple strategies
@@ -538,11 +535,6 @@ async def _execute_container_command(
                 ]
             )
 
-            if profiling_dir:
-                mount_args.append(
-                    f"--mount type=bind,src={profiling_dir},dst={profiling_dir}",
-                )
-
             miniforge3_path = Path(self.openhands_setup_dir) / "miniforge3"
             mount_args.append(f"--mount type=bind,src={miniforge3_path},dst=/openhands_setup/miniforge3,ro")
             mount_args.append(f"--mount type=bind,src={miniforge3_path},dst={miniforge3_path},ro")
@@ -937,7 +929,6 @@ async def process_single_datapoint(self, data_point: dict[str, Any], persistent_
                     api_base,
                     agent_run_id,
                     instance_dataset_path,
-                    persistent_dir / "profiling",
                 )
             else:
                 raise ValueError(

From 13f99cc76eccfc05f75831c250c7c4d2f6c17ae2 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 12:22:06 -0800
Subject: [PATCH 068/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 8dd46adc8..29b5de831 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -582,11 +582,6 @@ async def _execute_container_command(
         container_commands.append(command)
         combined_command = " && ".join(container_commands)
 
-        # TODO remove
-        import sys
-
-        print("\n".join(mount_args), file=sys.stderr)
-
         mount_str = " ".join(mount_args)
 
         # Launch Apptainer container and execute the command

From b046da5c57531183d84464c645ff9dc5cff580c9 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 13:33:57 -0800
Subject: [PATCH 069/127] actually print

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index cc6806791..dbce75f71 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -17,6 +17,7 @@
 import os
 import shutil
 import subprocess
+import sys
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
@@ -774,7 +775,7 @@ def _run_setup_shell_script(
             raise RuntimeError("Failed to capture script output")
 
         for line in process.stdout:
-            print(line, end="", flush=True)
+            print(line, end="", file=sys.stderr)
             output_lines.append(line)
 
         process.wait(timeout=timeout_seconds)

From 94ff7db44c2c2b5594649be2e3399d37980c9133 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 13:45:13 -0800
Subject: [PATCH 070/127] bump openhands

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/configs/swebench_openhands.yaml                | 2 +-
 .../swe_agents/configs/swebench_openhands_training.yaml       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
index 3aa72d780..5234a1b22 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
@@ -9,7 +9,7 @@ swe_agents:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151
+      agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01
       
       # Container configuration
       container_formatter: ???
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
index b30304795..e74bf228f 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
@@ -8,7 +8,7 @@ swe_agents_train:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151
+      agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01
       # Container configuration
       container_formatter: ???
       container_folder_path: null
@@ -39,7 +39,7 @@ swe_agents_val:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 73eef968c098c4524cf373b78a05a58a993ee151
+      agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01
       # Container configuration
       container_formatter: ???
       container_folder_path: null

From ee61b6c7e40be162fbd314f0882ee6c4d385f94e Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 14:02:56 -0800
Subject: [PATCH 071/127] try dict functool to avoid serialization error

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index dbce75f71..417a115fe 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -22,8 +22,6 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
-from openai.types.responses.function_tool import FunctionTool
-
 from nemo_gym.global_config import get_global_config_dict
 from nemo_gym.openai_utils import (
     NeMoGymEasyInputMessage,
@@ -392,15 +390,15 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List:
         # Convert to Response FunctionTool format which is flat
         if tool.get("type") == "function" and "function" in tool:
             func_def = tool["function"]
-            # Create FunctionTool object with flat structure
-            function_tool = FunctionTool(
-                type="function",
-                name=func_def.get("name", ""),
-                description=func_def.get("description"),
-                parameters=func_def.get("parameters"),
-                strict=func_def.get("strict"),  # May be None
+            tools.append(
+                dict(
+                    type="function",
+                    name=func_def.get("name", ""),
+                    description=func_def.get("description"),
+                    parameters=func_def.get("parameters"),
+                    strict=func_def.get("strict"),  # May be None
+                )
             )
-            tools.append(function_tool.model_dump())
     return tools
 
 

From c345ccc9a534b8cb8804983d570bf22a19489b9f Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 14:41:36 -0800
Subject: [PATCH 072/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_models/vllm_model/app.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index 30bb7a76f..46319303d 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -251,9 +251,6 @@ async def chat_completions(
             else:
                 raise e
 
-        # TODO remove
-        print(chat_completion_dict["usage"], flush=True)
-
         choice_dict = chat_completion_dict["choices"][0]
         if self.config.uses_reasoning_parser:
             reasoning_content = choice_dict["message"].get("reasoning_content")

From 6606ae4ffb6017545a5294e3f56e006db4fcd298 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 14:44:26 -0800
Subject: [PATCH 073/127] print tool

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 417a115fe..8d46132e3 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -390,6 +390,8 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List:
         # Convert to Response FunctionTool format which is flat
         if tool.get("type") == "function" and "function" in tool:
             func_def = tool["function"]
+            # TODO remove
+            print(func_def, file=sys.stderr)
             tools.append(
                 dict(
                     type="function",

From 8a78997064baa059e8694f76356477e2e3139bee Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 14:57:20 -0800
Subject: [PATCH 074/127] just fail

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 8d46132e3..06d244fc7 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -392,6 +392,7 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List:
             func_def = tool["function"]
             # TODO remove
             print(func_def, file=sys.stderr)
+            1 / 0
             tools.append(
                 dict(
                     type="function",

From c1cb16b0bee3f36486dae014f723ef1dc69bcfcc Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 15:02:04 -0800
Subject: [PATCH 075/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 06d244fc7..417a115fe 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -390,9 +390,6 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List:
         # Convert to Response FunctionTool format which is flat
         if tool.get("type") == "function" and "function" in tool:
             func_def = tool["function"]
-            # TODO remove
-            print(func_def, file=sys.stderr)
-            1 / 0
             tools.append(
                 dict(
                     type="function",

From 6d7172e5400ec904b1974b49f5067427997ce3ce Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 15:05:37 -0800
Subject: [PATCH 076/127] try error on warnings

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index ce2eb61a2..26dc2da48 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -17,6 +17,9 @@
 import sys
 import time
 import uuid
+
+# TODO remove if doesn't work
+import warnings
 from asyncio import Semaphore
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional
@@ -55,6 +58,10 @@
 )
 
 
+# Set all warnings to be treated as errors
+warnings.filterwarnings("error")
+
+
 @ray.remote
 class ConcurrentContainerCounter:
     def __init__(self):

From 22342acff4d9586254874291289cd92159898b67 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 15:10:31 -0800
Subject: [PATCH 077/127] print metadata

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 26dc2da48..baa388657 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -314,6 +314,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             if "swe-bench-metrics" in result:
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
+            # TODO remove
+            print(metadata, file=sys.stderr)
+
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",
                 created_at=int(time.time()),

From 1b74d0ee5aacb2583d95895df56ad98d04f2cbb1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 15:16:00 -0800
Subject: [PATCH 078/127] print many newlines

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index baa388657..10a6feffa 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -315,7 +315,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
             # TODO remove
-            print(metadata, file=sys.stderr)
+            print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr)
 
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",

From 37312c1b9cdb107cdc90220e591dc7db0e9e8a14 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 15:19:57 -0800
Subject: [PATCH 079/127] dump

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 10a6feffa..9dd26773c 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -316,6 +316,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
 
             # TODO remove
             print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr)
+            with open("temp.json", "w") as f:
+                json.dump(metadata, f)
 
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",

From f600cdb2783a0029c8df7762db31b447579a50e4 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Fri, 23 Jan 2026 15:51:48 -0800
Subject: [PATCH 080/127] feat: update oh w/ mem limit and cmd timeout

Signed-off-by: Sugam Devare <sdevare@nvidia.com>
---
 responses_api_agents/swe_agents/app.py        |  8 ++++++
 .../configs/swebench_openhands.yaml           | 10 ++++---
 .../configs/swebench_openhands_training.yaml  | 16 +++++++-----
 .../swe_agents/run_openhands.py               | 26 ++++++++++++++++---
 responses_api_agents/swe_agents/utils.py      |  4 +++
 5 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 1c0c9b046..3844f8330 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -94,6 +94,12 @@ class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig):
 
     swebench_agent_timeout: int = Field(default=45 * 60, description="Timeout for running the agent (seconds)")
 
+    apptainer_memory_limit_mb: int = Field(
+        default=32 * 1024, description="Memory limit for the apptainer container (MB)"
+    )
+
+    command_exec_timeout: int = Field(default=5 * 60, description="Timeout for executing the command (seconds)")
+
     # Concurrency control
     concurrency: int = Field(default=256, description="Maximum number of concurrent SWE-bench runs")
 
@@ -212,6 +218,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "dataset_path": self.config.dataset_path,
                 "instance_dir": instance_dir,
                 "ray_queue_time": ray_queue_time,
+                "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb,
+                "command_exec_timeout": self.config.command_exec_timeout,
             }
 
             future = runner_ray_remote.remote(run_swebench_evaluation, params)
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
index eb4a57583..a7260856e 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
@@ -9,14 +9,16 @@ swe_agents:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
+      agent_framework_commit: dfd04f41c9af452a9a230c7378699c6119bcb2db
       
       # Container configuration
       container_formatter: ???
       container_folder_path: null
-      swebench_agent_timeout: 2700 # 45 minutes
-      swebench_tests_timeout: 1800
-
+      swebench_agent_timeout: 1800
+      swebench_tests_timeout: 900
+      apptainer_memory_limit_mb: 32768
+      command_exec_timeout: 300
+      
       dataset_path: ???
       
       # Optional model server reference
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
index d39fac5e1..bd7f8abb4 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
@@ -8,12 +8,14 @@ swe_agents_train:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
+      agent_framework_commit: dfd04f41c9af452a9a230c7378699c6119bcb2db
       # Container configuration
       container_formatter: ???
       container_folder_path: null
-      swebench_agent_timeout: 2700 # 45 minutes
-      swebench_tests_timeout: 900  # 15 minutes
+      swebench_agent_timeout: 1800
+      swebench_tests_timeout: 900
+      apptainer_memory_limit_mb: 32768
+      command_exec_timeout: 300
       dataset_path: ???
       model_server:
         name: policy_model # openai_model
@@ -39,12 +41,14 @@ swe_agents_val:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: 8acdde3cc6ccdf1e85006da381b7ef73330dd475
+      agent_framework_commit: dfd04f41c9af452a9a230c7378699c6119bcb2db
       # Container configuration
       container_formatter: ???
       container_folder_path: null
-      swebench_agent_timeout: 2700 # 45 minutes
-      swebench_tests_timeout: 1800  # 30 minutes
+      swebench_agent_timeout: 1800
+      swebench_tests_timeout: 900
+      apptainer_memory_limit_mb: 32768
+      command_exec_timeout: 300
       dataset_path: ???
       # Optional model server reference
       model_server:
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 1eb4e3313..7fdaba4af 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -62,6 +62,8 @@ class SweBenchGenerationConfig:
     agent_max_turns: int = 100
     swebench_tests_timeout: int = 30 * 60
     swebench_agent_timeout: int = 45 * 60
+    apptainer_memory_limit_mb: int = 32 * 1024
+    command_exec_timeout: int = 5 * 60
     inference: SweBenchInferenceConfig = field(default_factory=SweBenchInferenceConfig)
     server: dict = field(default_factory=dict)
 
@@ -224,8 +226,10 @@ async def _run_openhands(
         agent_script_name = f"agent_script_{agent_run_id}.sh"
         cleanup_commands = (
             f"cd /openhands_setup/OpenHands && "
-            f"mkdir -p /trajectories_mount/trajectories && "
-            f"cp -r {eval_dir_in_openhands}/*/*/* /trajectories_mount/trajectories/{data_point['instance_id']}/ &&"
+            f"mkdir -p /trajectories_mount/trajectories/{data_point['instance_id']}/llm_completions/{data_point['instance_id']}/ && "
+            f"cp {eval_dir_in_openhands}/*/*/*/output.jsonl /trajectories_mount/trajectories/{data_point['instance_id']}/ && "
+            f"latest=$(ls -t {eval_dir_in_openhands}/*/*/*/llm_completions/*/*.json 2>/dev/null | head -1); "
+            f'[ -n "$latest" ] && cp "$latest" /trajectories_mount/trajectories/{data_point["instance_id"]}/llm_completions/{data_point["instance_id"]}/ && '
             f"rm -rf {eval_dir_in_openhands} && rm -rf {config_file_path}"
         )
 
@@ -264,6 +268,8 @@ async def _run_openhands(
             "export POETRY_VIRTUALENVS_IN_PROJECT=true && "
             "export POETRY_VIRTUALENVS_CREATE=false && "
             "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && "
+            f"export TMUX_MEMORY_LIMIT={self.cfg.apptainer_memory_limit_mb} && "
+            f"export COMMAND_EXEC_TIMEOUT={self.cfg.command_exec_timeout} && "
             # TODO (sugam): fix cryptography issue
             # "override_dir=$(mktemp -d /tmp/cryptography_override.XXXX) && "
             # # Reinstall cryptography inside the container (via poetry's venv) using a compatible wheel
@@ -358,7 +364,7 @@ async def _run_openhands(
                     )
                 )
         except Exception as e:
-            print(f"oh run_infer.sh output parsing failed: {e}", flush=True)
+            print(f"Running OpenHands failed: {e}", flush=True)
             return None
         return pred_file
 
@@ -595,10 +601,14 @@ async def _execute_container_command(
 
         # Launch Apptainer container and execute the command
         apptainer_cmd = (
-            f"apptainer exec --writable-tmpfs --cleanenv --no-mount home,tmp,bind-paths "
+            f"apptainer exec --writable-tmpfs --cleanenv --pid --no-mount home,tmp,bind-paths "
             f"{mount_str} "
             f" {container_name} bash -c {shlex.quote(combined_command)}"
         )
+        memory_limit_mb = self.cfg.apptainer_memory_limit_mb
+        if memory_limit_mb is not None and memory_limit_mb > 0:
+            memory_limit_kb = int(memory_limit_mb) * 1024
+            apptainer_cmd = f"ulimit -v {memory_limit_kb} && {apptainer_cmd}"
 
         # Retry apptainer command up to max_retries times
         for attempt in range(max_retries):
@@ -633,6 +643,14 @@ async def _execute_container_command(
 
                 if len(pred_files) == 1:
                     return pred_files[0]
+                elif len(pred_files) > 1:
+                    latest_file = max(pred_files, key=os.path.getmtime)
+                    print(
+                        f"Multiple outputs found for {data_point['instance_id']} "
+                        f"({len(pred_files)}). Using latest: {latest_file}",
+                        flush=True,
+                    )
+                    return latest_file
                 else:
                     raise ValueError(
                         f"Expected exactly one file matching {expected_file_pattern} for {data_point['instance_id']}, "
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index e60df2ea9..c0572bb59 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -641,6 +641,8 @@ async def run_swebench_evaluation(
     instance_dir: Optional[str] = None,
     ray_queue_time: Optional[float] = None,
     ray_submit_time: Optional[float] = None,
+    apptainer_memory_limit_mb: Optional[int] = None,
+    command_exec_timeout: Optional[int] = None,
 ) -> Dict:
     # Create persistent directory for I/O and logs in local workspace
     workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
@@ -675,6 +677,8 @@ async def run_swebench_evaluation(
         agent_max_turns=agent_max_turns,
         swebench_tests_timeout=swebench_tests_timeout,
         swebench_agent_timeout=swebench_agent_timeout,
+        apptainer_memory_limit_mb=apptainer_memory_limit_mb,
+        command_exec_timeout=command_exec_timeout,
         inference=inference_config,
         server=server,
     )

From 591b409975d40466f32445ab8e7b71bb1fa7193e Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 15:54:45 -0800
Subject: [PATCH 081/127] error and prints

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 9dd26773c..fb3dfa44a 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -262,6 +262,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params)
             result = await future
 
+            # TODO remove
+            print("HIT 1", file=sys.stderr)
+
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])
@@ -270,6 +273,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             raw_tools = result.get("tools", [])
             tools = convert_tools_to_function_format(raw_tools) if raw_tools else []
 
+            # TODO remove
+            print("HIT 2", file=sys.stderr)
+
             # Convert trajectory to NeMoGym output items
             if trajectory:
                 output_items = convert_trajectory_to_output_items(
@@ -277,6 +283,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     self.config.agent_framework,
                 )
 
+            # TODO remove
+            print("HIT 3", file=sys.stderr)
+
             # If no trajectory or empty output, create a summary message
             if not output_items:
                 output_items = [
@@ -297,6 +306,9 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     )
                 ]
 
+            # TODO remove
+            print("HIT 4", file=sys.stderr)
+
             # Store the full result in metadata for the verify step
             # Note: metadata values must be strings for NeMoGymResponse
             metadata = {
@@ -310,12 +322,16 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 if key in result:
                     metadata[key] = str(result[key])
 
+            # TODO remove
+            print("HIT 5", file=sys.stderr)
+
             # For complex metrics, store as JSON string
             if "swe-bench-metrics" in result:
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
             # TODO remove
             print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr)
+            1 / 0
             with open("temp.json", "w") as f:
                 json.dump(metadata, f)
 

From 9deffae825e3033b3b2930b2e9f68a3e7a04cf3c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:01:26 -0800
Subject: [PATCH 082/127] dont error on warnings

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index fb3dfa44a..fb1df1bc8 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -19,7 +19,6 @@
 import uuid
 
 # TODO remove if doesn't work
-import warnings
 from asyncio import Semaphore
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional
@@ -58,10 +57,6 @@
 )
 
 
-# Set all warnings to be treated as errors
-warnings.filterwarnings("error")
-
-
 @ray.remote
 class ConcurrentContainerCounter:
     def __init__(self):

From 83b585171d27bf2be59c3e33756b8890b45137e2 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:12:35 -0800
Subject: [PATCH 083/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index fb1df1bc8..ce2eb61a2 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -17,8 +17,6 @@
 import sys
 import time
 import uuid
-
-# TODO remove if doesn't work
 from asyncio import Semaphore
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional
@@ -257,9 +255,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params)
             result = await future
 
-            # TODO remove
-            print("HIT 1", file=sys.stderr)
-
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])
@@ -268,9 +263,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             raw_tools = result.get("tools", [])
             tools = convert_tools_to_function_format(raw_tools) if raw_tools else []
 
-            # TODO remove
-            print("HIT 2", file=sys.stderr)
-
             # Convert trajectory to NeMoGym output items
             if trajectory:
                 output_items = convert_trajectory_to_output_items(
@@ -278,9 +270,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     self.config.agent_framework,
                 )
 
-            # TODO remove
-            print("HIT 3", file=sys.stderr)
-
             # If no trajectory or empty output, create a summary message
             if not output_items:
                 output_items = [
@@ -301,9 +290,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     )
                 ]
 
-            # TODO remove
-            print("HIT 4", file=sys.stderr)
-
             # Store the full result in metadata for the verify step
             # Note: metadata values must be strings for NeMoGymResponse
             metadata = {
@@ -317,19 +303,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 if key in result:
                     metadata[key] = str(result[key])
 
-            # TODO remove
-            print("HIT 5", file=sys.stderr)
-
             # For complex metrics, store as JSON string
             if "swe-bench-metrics" in result:
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
-            # TODO remove
-            print(f"METADATA: {metadata}\n\n\n\n", file=sys.stderr)
-            1 / 0
-            with open("temp.json", "w") as f:
-                json.dump(metadata, f)
-
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",
                 created_at=int(time.time()),

From 879c623871a9ceb1702c9d4a183de5703d7019be Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:14:15 -0800
Subject: [PATCH 084/127] ignore pydantic serialization warnings

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index ce2eb61a2..e337b991e 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -17,12 +17,14 @@
 import sys
 import time
 import uuid
+import warnings
 from asyncio import Semaphore
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional
 
 import ray
 from pydantic import ConfigDict, Field
+from pydantic_core import PydanticSerializationUnexpectedValue
 
 from nemo_gym.base_resources_server import (
     BaseRunRequest,
@@ -55,6 +57,9 @@
 )
 
 
+warnings.filterwarnings("ignore", category=PydanticSerializationUnexpectedValue)
+
+
 @ray.remote
 class ConcurrentContainerCounter:
     def __init__(self):

From 01f7a8dc67c77035a45ffd3fa9562124c38f717c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:18:42 -0800
Subject: [PATCH 085/127] try filter by message

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index e337b991e..0f37d2681 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -24,7 +24,6 @@
 
 import ray
 from pydantic import ConfigDict, Field
-from pydantic_core import PydanticSerializationUnexpectedValue
 
 from nemo_gym.base_resources_server import (
     BaseRunRequest,
@@ -57,7 +56,7 @@
 )
 
 
-warnings.filterwarnings("ignore", category=PydanticSerializationUnexpectedValue)
+warnings.filterwarnings("ignore", message="FunctionTool")
 
 
 @ray.remote

From b29b432391f2d953261a0802839602e7743a9c29 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:22:39 -0800
Subject: [PATCH 086/127] revert back to function tool

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 417a115fe..5ef2d72d5 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -22,6 +22,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
+from openai.types.responses.function_tool import FunctionTool
+
 from nemo_gym.global_config import get_global_config_dict
 from nemo_gym.openai_utils import (
     NeMoGymEasyInputMessage,
@@ -390,15 +392,15 @@ def convert_tools_to_function_format(raw_tools: List[Dict]) -> List:
         # Convert to Response FunctionTool format which is flat
         if tool.get("type") == "function" and "function" in tool:
             func_def = tool["function"]
-            tools.append(
-                dict(
-                    type="function",
-                    name=func_def.get("name", ""),
-                    description=func_def.get("description"),
-                    parameters=func_def.get("parameters"),
-                    strict=func_def.get("strict"),  # May be None
-                )
+            # Create FunctionTool object with flat structure
+            function_tool = FunctionTool(
+                type="function",
+                name=func_def.get("name", ""),
+                description=func_def.get("description"),
+                parameters=func_def.get("parameters"),
+                strict=func_def.get("strict"),  # May be None
             )
+            tools.append(function_tool)
     return tools
 
 

From 78dc107a75bfbcc0139f81b102afc0d912e3cce8 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:24:03 -0800
Subject: [PATCH 087/127] add comment

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 0f37d2681..a65aa9cb8 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -56,6 +56,8 @@
 )
 
 
+# There are some mysterious Pydantic serialization warnings related to FunctionTool that are not fatal that clutter up logs.
+# At some point we can try continue chasing this one down.
 warnings.filterwarnings("ignore", message="FunctionTool")
 
 

From 028a9d7cb60b4aabd8ad4e216a0c9a051c304e69 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:25:12 -0800
Subject: [PATCH 088/127] add example

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index a65aa9cb8..aba41d406 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -57,7 +57,8 @@
 
 
 # There are some mysterious Pydantic serialization warnings related to FunctionTool that are not fatal that clutter up logs.
-# At some point we can try continue chasing this one down.
+# At some point we can try continue chasing this one down. Example:
+# (NemoGym pid=3160799) (swe_agents_val)   PydanticSerializationUnexpectedValue(Expected `general-fields` - serialized value may not be as expected [field_name='tools', input_value=FunctionTool(name='str_re... a single call each.\n'), input_type=FunctionTool])
 warnings.filterwarnings("ignore", message="FunctionTool")
 
 

From 99d1fd571551e2a08332478f3b6af077f3720797 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:34:52 -0800
Subject: [PATCH 089/127] try profiling openhands

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 29b5de831..345adbb14 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -262,7 +262,7 @@ async def _run_openhands(
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
             "export POETRY_VIRTUALENVS_IN_PROJECT=true && "
-            f"export NG_PROFILING_DIR=/trajectories_mount/profiling && "
+            f"export NG_PROFILING_DIR=/trajectories_mount/profiling_openhands && "
             "export POETRY_VIRTUALENVS_CREATE=false && "
             "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && "
             # TODO (sugam): fix cryptography issue

From cb4a0406880ec5a188557cdd84ca8fddd959200d Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:50:18 -0800
Subject: [PATCH 090/127] bump

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/configs/swebench_openhands.yaml                | 2 +-
 .../swe_agents/configs/swebench_openhands_training.yaml       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
index 5234a1b22..40744020a 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands.yaml
@@ -9,7 +9,7 @@ swe_agents:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01
+      agent_framework_commit: bxyu/profiling
       
       # Container configuration
       container_formatter: ???
diff --git a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
index e74bf228f..aa5003562 100644
--- a/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
+++ b/responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
@@ -8,7 +8,7 @@ swe_agents_train:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01
+      agent_framework_commit: bxyu/profiling
       # Container configuration
       container_formatter: ???
       container_folder_path: null
@@ -39,7 +39,7 @@ swe_agents_val:
       agent_config: responses_api_agents/swe_agents/configs/oh_config.toml
       agent_max_turns: 100
       agent_framework_repo: https://github.com/sdevare-nv/nv-OpenHands.git
-      agent_framework_commit: a9ce5675d935f52b4d5ca91d723726dc90833f01
+      agent_framework_commit: bxyu/profiling
       # Container configuration
       container_formatter: ???
       container_folder_path: null

From 1a763a711e5f150dffffa22ed3e1432da66a664b Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Fri, 23 Jan 2026 16:56:30 -0800
Subject: [PATCH 091/127] enable logging

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/run_openhands.py                | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 345adbb14..2eb315dbc 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -250,14 +250,16 @@ async def _run_openhands(
             # Use pre-built OpenHands
             "cd /openhands_setup/OpenHands && "
             "export RUNTIME=local && "
-            # "export LOG_LEVEL=DEBUG && "
-            # "export LOG_TO_FILE=true && "
-            "export LOG_LEVEL=CRITICAL && "
-            "export DEBUG=False && "
-            "export DEBUG_LLM=False && "
-            "export LOG_TO_FILE=False && "
-            "export LOG_ALL_EVENTS=False && "
-            "export DEBUG_RUNTIME=False && "
+            # Enable these two for debug logging
+            "export LOG_LEVEL=DEBUG && "
+            "export LOG_TO_FILE=true && "
+            # Disable these 5 for logging
+            # "export LOG_LEVEL=CRITICAL && "
+            # "export DEBUG=False && "
+            # "export DEBUG_LLM=False && "
+            # "export LOG_TO_FILE=False && "
+            # "export LOG_ALL_EVENTS=False && "
+            # "export DEBUG_RUNTIME=False && "
             "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && "
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)

From 16e8be129370dbf67b27d3247e82657a3ba7a0b1 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Fri, 23 Jan 2026 17:25:03 -0800
Subject: [PATCH 092/127] feat: move copy logic to host

Signed-off-by: Sugam Devare <sdevare@nvidia.com>
---
 .../swe_agents/run_openhands.py               | 78 +++++++++++++++----
 1 file changed, 63 insertions(+), 15 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 7fdaba4af..c61999018 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -17,6 +17,7 @@
 import os
 import re
 import shlex
+import shutil
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -224,14 +225,6 @@ async def _run_openhands(
         assert self.openhands_setup_dir is not None, "OpenHands setup directory is not set"
 
         agent_script_name = f"agent_script_{agent_run_id}.sh"
-        cleanup_commands = (
-            f"cd /openhands_setup/OpenHands && "
-            f"mkdir -p /trajectories_mount/trajectories/{data_point['instance_id']}/llm_completions/{data_point['instance_id']}/ && "
-            f"cp {eval_dir_in_openhands}/*/*/*/output.jsonl /trajectories_mount/trajectories/{data_point['instance_id']}/ && "
-            f"latest=$(ls -t {eval_dir_in_openhands}/*/*/*/llm_completions/*/*.json 2>/dev/null | head -1); "
-            f'[ -n "$latest" ] && cp "$latest" /trajectories_mount/trajectories/{data_point["instance_id"]}/llm_completions/{data_point["instance_id"]}/ && '
-            f"rm -rf {eval_dir_in_openhands} && rm -rf {config_file_path}"
-        )
 
         agent_main_cmd = (
             "if [ -d /workspace ]; then "
@@ -318,22 +311,18 @@ async def _run_openhands(
         agent_timeout_seconds = self.cfg.swebench_agent_timeout
         openhands_cmd = (
             f"timeout --signal=TERM --kill-after=30 {agent_timeout_seconds} "
-            f"bash /trajectories_mount/{agent_script_name}; "
-            f"echo 'Cleaning up...'; "
-            f"{cleanup_commands}"
+            f"bash /trajectories_mount/{agent_script_name}"
         )
 
         search_path = os.path.join(
-            self.output_dir / "trajectories",
-            "**",
-            data_point["instance_id"],
+            self.openhands_setup_dir / "OpenHands" / eval_dir_in_openhands,
             "**",
             "output.jsonl",
         )
 
         try:
             # Execute OpenHands command
-            out_file = await self._execute_container_command(
+            out_file_in_eval = await self._execute_container_command(
                 data_point=data_point,
                 command=openhands_cmd,
                 expected_file_pattern=search_path,
@@ -342,6 +331,12 @@ async def _run_openhands(
                 timeout=self.cfg.swebench_agent_timeout + 60,
                 dataset_mount_path=dataset_mount_path,
             )
+            out_file = self._openhands_dir_copy_from_host(
+                data_point=data_point,
+                eval_dir_in_openhands=eval_dir_in_openhands,
+                config_file_path=config_file_path,
+                output_file_path=out_file_in_eval,
+            )
 
             with open(out_file, "r") as f:
                 out_dict = json.loads(f.read().strip())
@@ -364,10 +359,63 @@ async def _run_openhands(
                     )
                 )
         except Exception as e:
+            self._openhands_dir_copy_from_host(
+                data_point=data_point,
+                eval_dir_in_openhands=eval_dir_in_openhands,
+                config_file_path=config_file_path,
+                output_file_path=None,
+            )
             print(f"Running OpenHands failed: {e}", flush=True)
             return None
         return pred_file
 
+    def _openhands_dir_copy_from_host(
+        self,
+        data_point: dict[str, Any],
+        eval_dir_in_openhands: str,
+        config_file_path: str,
+        output_file_path: Optional[str],
+    ) -> Optional[str]:
+    
+        eval_dir_on_host = Path(self.openhands_setup_dir) / "OpenHands" / eval_dir_in_openhands
+        trajectories_root = Path(self.output_dir) / "trajectories" / data_point["instance_id"]
+        llm_completions_dir = trajectories_root / "llm_completions" / data_point["instance_id"]
+        trajectories_root.mkdir(parents=True, exist_ok=True)
+        llm_completions_dir.mkdir(parents=True, exist_ok=True)
+
+        dest_output: Optional[str] = None
+        if output_file_path:
+            source_output = Path(output_file_path)
+            if not source_output.is_absolute():
+                source_output = eval_dir_on_host / source_output
+            if not source_output.exists():
+                output_candidates = sorted(eval_dir_on_host.glob("*/*/*/output.jsonl"), key=os.path.getmtime)
+                if not output_candidates:
+                    raise FileNotFoundError(
+                        f"No output.jsonl found under {eval_dir_on_host} for {data_point['instance_id']}."
+                    )
+                source_output = output_candidates[-1]
+
+            dest_output_path = trajectories_root / "output.jsonl"
+            shutil.copy2(source_output, dest_output_path)
+            dest_output = str(dest_output_path)
+
+        completion_candidates = glob.glob(str(eval_dir_on_host / "*/*/*/llm_completions/*/*.json"))
+        if completion_candidates:
+            latest_completion = max(completion_candidates, key=os.path.getmtime)
+            shutil.copy2(
+                latest_completion,
+                llm_completions_dir / Path(latest_completion).name,
+            )
+
+        shutil.rmtree(eval_dir_on_host, ignore_errors=True)
+        try:
+            Path(config_file_path).unlink()
+        except OSError:
+            pass
+
+        return dest_output
+
     def _write_instance_dataset(self, data_point: dict[str, Any], agent_run_id: str) -> Path:
         """
         To avoid making HF dataset API calls, we write the instance dictionary to a file and mount it in the container.

From 9699852c538b8385655797a30b7f84ab1a5bd75c Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sat, 24 Jan 2026 10:43:44 -0800
Subject: [PATCH 093/127] revert to shared folder

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 2eb315dbc..bf0253b52 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -264,7 +264,7 @@ async def _run_openhands(
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
             "export POETRY_VIRTUALENVS_IN_PROJECT=true && "
-            f"export NG_PROFILING_DIR=/trajectories_mount/profiling_openhands && "
+            f"export NG_PROFILING_DIR=/trajectories_mount/profiling && "
             "export POETRY_VIRTUALENVS_CREATE=false && "
             "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && "
             # TODO (sugam): fix cryptography issue

From cc43a52f0566db9763affb475c4e3bb6c48d43d2 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sat, 24 Jan 2026 11:52:20 -0800
Subject: [PATCH 094/127] pipe debug through

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py        | 22 +++++++++------
 .../swe_agents/run_openhands.py               | 28 +++++++++++--------
 responses_api_agents/swe_agents/utils.py      |  8 +++++-
 3 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index aba41d406..44a5777d0 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -86,21 +86,23 @@ def decrement(self):
 def runner_ray_remote(
     concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any]
 ) -> Any:
-    concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
-    print(f"Concurrent container #{concurrent_containers}", file=sys.stderr)
-
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
 
-    instance_id = params["problem_info"].get("instance_id", "unknown")
-    profiler = Profiler(name=instance_id, base_profile_dir=params["persistent_dir"] / "profiling")
-    profiler.start()
+    if params["debug"]:
+        concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
+        print(f"Concurrent container #{concurrent_containers}", file=sys.stderr)
+
+        instance_id = params["problem_info"].get("instance_id", "unknown")
+        profiler = Profiler(name=instance_id, base_profile_dir=params["persistent_dir"] / "profiling")
+        profiler.start()
 
     result = asyncio.run(runner(**params))
 
-    profiler.stop()
+    if params["debug"]:
+        profiler.stop()
 
-    ray.get(concurrent_container_counter.decrement.remote())
+        ray.get(concurrent_container_counter.decrement.remote())
 
     return result
 
@@ -166,6 +168,8 @@ class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig):
         description="Session ID for the run",
     )
 
+    debug: bool = False
+
 
 class SWEBenchRunRequest(BaseRunRequest):
     """Request format for SWE-bench runs."""
@@ -210,6 +214,7 @@ def model_post_init(self, __context: Any) -> None:
             self.config.openhands_setup_dir = setup_openhands_environment(
                 agent_framework_repo=self.config.agent_framework_repo,
                 agent_framework_commit=self.config.agent_framework_commit,
+                debug=self.config.debug,
             )
         self.config.swebench_setup_dir = setup_swebench_environment()
         self.config.r2e_gym_setup_dir = setup_r2e_gym_environment()
@@ -256,6 +261,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir,
                 "dataset_path": self.config.dataset_path,
                 "ray_queue_time": ray_queue_time,
+                "debug": self.config.debug,
             }
 
             # Run SWE-bench evaluation
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index bf0253b52..3ce797b42 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -97,6 +97,7 @@ class RunOpenHandsAgent:
     swebench_setup_dir: Path | None = None
     r2e_gym_setup_dir: Path | None = None
     dataset_path: str | None = None
+    debug: bool = False
 
     async def _run_swe_agent(self, data_point, api_base):
         """
@@ -229,6 +230,20 @@ async def _run_openhands(
             f"rm -rf {eval_dir_in_openhands} && rm -rf {config_file_path}"
         )
 
+        if self.debug:
+            log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && "
+            profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && "
+        else:
+            log_cmd = (
+                "export LOG_LEVEL=CRITICAL && "
+                "export DEBUG=False && "
+                "export DEBUG_LLM=False && "
+                "export LOG_TO_FILE=False && "
+                "export LOG_ALL_EVENTS=False && "
+                "export DEBUG_RUNTIME=False && "
+            )
+            profiling_cmd = ""
+
         agent_main_cmd = (
             "if [ -d /workspace ]; then "
             "    echo 'Exiting because /workspace is mounted.' && "
@@ -250,21 +265,12 @@ async def _run_openhands(
             # Use pre-built OpenHands
             "cd /openhands_setup/OpenHands && "
             "export RUNTIME=local && "
-            # Enable these two for debug logging
-            "export LOG_LEVEL=DEBUG && "
-            "export LOG_TO_FILE=true && "
-            # Disable these 5 for logging
-            # "export LOG_LEVEL=CRITICAL && "
-            # "export DEBUG=False && "
-            # "export DEBUG_LLM=False && "
-            # "export LOG_TO_FILE=False && "
-            # "export LOG_ALL_EVENTS=False && "
-            # "export DEBUG_RUNTIME=False && "
+            f"{log_cmd}"
+            f"{profiling_cmd}"
             "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && "
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
             "export POETRY_VIRTUALENVS_IN_PROJECT=true && "
-            f"export NG_PROFILING_DIR=/trajectories_mount/profiling && "
             "export POETRY_VIRTUALENVS_CREATE=false && "
             "export POETRY_VIRTUALENVS_PATH=/openhands_setup/OpenHands && "
             # TODO (sugam): fix cryptography issue
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 5ef2d72d5..251edf044 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -642,6 +642,7 @@ async def run_swebench_evaluation(
     dataset_path: Optional[str] = None,
     ray_queue_time: Optional[float] = None,
     ray_submit_time: Optional[float] = None,
+    debug: bool = False,
 ) -> Dict:
     instance_id = problem_info.get("instance_id", "unknown")
     output_file = persistent_dir / "output.jsonl"
@@ -682,6 +683,7 @@ async def run_swebench_evaluation(
         swebench_setup_dir=swebench_setup_dir,
         r2e_gym_setup_dir=r2e_gym_setup_dir,
         dataset_path=dataset_path,
+        debug=debug,
     )
 
     result = await run_oh.process_single_datapoint(problem_info, persistent_dir)
@@ -750,6 +752,7 @@ def _run_setup_shell_script(
     timeout_seconds: int,
     label: str,
     timeout_error_message: Optional[str] = None,
+    debug: bool = False,
 ) -> None:
     script_path = setup_dir / script_name
 
@@ -774,8 +777,9 @@ def _run_setup_shell_script(
         if process.stdout is None:
             raise RuntimeError("Failed to capture script output")
 
+        target_file = sys.stderr if debug else sys.stdout
         for line in process.stdout:
-            print(line, end="", file=sys.stderr)
+            print(line, end="", file=target_file)
             output_lines.append(line)
 
         process.wait(timeout=timeout_seconds)
@@ -1039,6 +1043,7 @@ def setup_openhands_environment(
     agent_framework_repo: Optional[str] = "https://github.com/sdevare-nv/nv-OpenHands.git",
     agent_framework_commit: str = "gym",
     setup_dir: Optional[Path] = None,
+    debug: bool = False,
 ) -> Path:
     setup_dir = _resolve_setup_directory(setup_dir, "swe_openhands_setup")
 
@@ -1207,6 +1212,7 @@ def setup_openhands_environment(
             timeout_seconds=1800,
             label="OpenHands",
             timeout_error_message="OpenHands setup timed out after 30 minutes",
+            debug=debug,
         )
 
         print(f"Setup directory: {setup_dir}", flush=True)

From 964a8592d5d439899ef0a1a13db824ec59d2e0ba Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sat, 24 Jan 2026 11:53:00 -0800
Subject: [PATCH 095/127] add apt install graphviz

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 3ce797b42..5720dc2c7 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -232,7 +232,9 @@ async def _run_openhands(
 
         if self.debug:
             log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && "
-            profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && "
+            profiling_cmd = (
+                "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt update && apt install -y graphviz &&"
+            )
         else:
             log_cmd = (
                 "export LOG_LEVEL=CRITICAL && "

From 8c5f56639811f92e69a76b331f314747471b72a2 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sat, 24 Jan 2026 12:01:39 -0800
Subject: [PATCH 096/127] try apt get

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 5720dc2c7..43a84710f 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -232,9 +232,7 @@ async def _run_openhands(
 
         if self.debug:
             log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && "
-            profiling_cmd = (
-                "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt update && apt install -y graphviz &&"
-            )
+            profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt-get update && apt-get install -y graphviz &&"
         else:
             log_cmd = (
                 "export LOG_LEVEL=CRITICAL && "

From 82414d89bdbb176d80d0ff80b14f64a851c74e07 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sat, 24 Jan 2026 12:07:30 -0800
Subject: [PATCH 097/127] remove apt install

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 43a84710f..3ce797b42 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -232,7 +232,7 @@ async def _run_openhands(
 
         if self.debug:
             log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && "
-            profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && apt-get update && apt-get install -y graphviz &&"
+            profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && "
         else:
             log_cmd = (
                 "export LOG_LEVEL=CRITICAL && "

From 59f8a4d89072c0152d20affceb17a16e285dfc58 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sat, 24 Jan 2026 12:11:22 -0800
Subject: [PATCH 098/127] dump out afterwards

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 .../swe_agents/run_openhands.py                 | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 3ce797b42..231cda310 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -25,6 +25,8 @@
 from typing import Any, Optional
 
 import tomlkit
+from gprof2dot import main as gprof2dot_main
+from pydot import graph_from_dot_file
 
 
 class SupportedAgentFrameworks(str, Enum):
@@ -366,6 +368,21 @@ async def _run_openhands(
                         }
                     )
                 )
+
+            # Dump out dot and png files from profiling on OpenHands level
+            if self.debug:
+                base_profile_dir = Path(self.output_dir) / "profiling"
+                profiling_name = "openhands"
+                callgrind_path = base_profile_dir / f"{profiling_name}.callgrind"
+                callgrind_dotfile_path = base_profile_dir / f"{profiling_name}.dot"
+                callgrind_graph_path = base_profile_dir / f"{profiling_name}.png"
+
+                gprof2dot_main(
+                    argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()
+                )
+
+                (graph,) = graph_from_dot_file(callgrind_dotfile_path)
+                graph.write_png(callgrind_graph_path)
         except Exception as e:
             print(f"oh run_infer.sh output parsing failed: {e}", flush=True)
             return None

From a3cf012046c5bf8ace4b950f9e3cd0a55c239f84 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sun, 25 Jan 2026 21:14:31 -0800
Subject: [PATCH 099/127] bump up pct

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 nemo_gym/profiling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_gym/profiling.py b/nemo_gym/profiling.py
index dcdc61e51..c9eb37572 100644
--- a/nemo_gym/profiling.py
+++ b/nemo_gym/profiling.py
@@ -33,7 +33,7 @@ def dump(self) -> None:
         callgrind_graph_path = self.base_profile_dir / f"{self.name}.png"
 
         yappi.get_func_stats().save(callgrind_path, type="CALLGRIND")
-        gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split())
+        gprof2dot_main(argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 5 -n 5 {callgrind_path}".split())
 
         (graph,) = graph_from_dot_file(callgrind_dotfile_path)
         graph.write_png(callgrind_graph_path)

From 54d95c18d91b8a9506c57f52b99bce433f13b63a Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Sun, 25 Jan 2026 21:16:23 -0800
Subject: [PATCH 100/127] increase to 5

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/run_openhands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 231cda310..90ed05ee4 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -378,7 +378,7 @@ async def _run_openhands(
                 callgrind_graph_path = base_profile_dir / f"{profiling_name}.png"
 
                 gprof2dot_main(
-                    argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 1 -n 1 {callgrind_path}".split()
+                    argv=f"--format=callgrind --output={callgrind_dotfile_path} -e 5 -n 5 {callgrind_path}".split()
                 )
 
                 (graph,) = graph_from_dot_file(callgrind_dotfile_path)

From a688dd6580a633a3a009a258600c40084df59d48 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:18:41 -0800
Subject: [PATCH 101/127] add hits

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 9e6a9de2c..01e341983 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -276,6 +276,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params)
             result = await future
 
+            print("HIT A", file=sys.stderr)
+
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])
@@ -291,6 +293,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     self.config.agent_framework,
                 )
 
+            print("HIT B", file=sys.stderr)
+
             # If no trajectory or empty output, create a summary message
             if not output_items:
                 output_items = [
@@ -328,6 +332,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             if "swe-bench-metrics" in result:
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
+            print("HIT C", file=sys.stderr)
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",
                 created_at=int(time.time()),
@@ -388,6 +393,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             # Run the evaluation
             response = await self.responses(fixed_params)
 
+            print("HIT D", file=sys.stderr)
+
             # Extract initial input messages from the response output and get filtered output
             # These are the system/user messages that were actually sent to the agent
             input_messages, filtered_output = extract_input_messages_from_trajectory(response.output)
@@ -406,6 +413,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             # Remove metadata from response after extracting metrics
             response = response.model_copy(update={"metadata": None})
 
+            print("HIT E", file=sys.stderr)
+
             # Parse metrics from JSON string if present
             metrics = json.loads(metadata.get("swe-bench-metrics", "{}")) if "swe-bench-metrics" in metadata else {}
 
@@ -418,6 +427,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
+            print("HIT F", file=sys.stderr)
+
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input,

From aa0b2c76bcd0b90c67548de32fc1d7f7b7ef81b1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:22:55 -0800
Subject: [PATCH 102/127] clean

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 01e341983..9e6a9de2c 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -276,8 +276,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             future = runner_ray_remote.remote(self._container_counter, run_swebench_evaluation, params)
             result = await future
 
-            print("HIT A", file=sys.stderr)
-
             # Extract trajectory and convert to proper NeMoGym format
             output_items = []
             trajectory = result.get("trajectory", [])
@@ -293,8 +291,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     self.config.agent_framework,
                 )
 
-            print("HIT B", file=sys.stderr)
-
             # If no trajectory or empty output, create a summary message
             if not output_items:
                 output_items = [
@@ -332,7 +328,6 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             if "swe-bench-metrics" in result:
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
-            print("HIT C", file=sys.stderr)
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",
                 created_at=int(time.time()),
@@ -393,8 +388,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             # Run the evaluation
             response = await self.responses(fixed_params)
 
-            print("HIT D", file=sys.stderr)
-
             # Extract initial input messages from the response output and get filtered output
             # These are the system/user messages that were actually sent to the agent
             input_messages, filtered_output = extract_input_messages_from_trajectory(response.output)
@@ -413,8 +406,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             # Remove metadata from response after extracting metrics
             response = response.model_copy(update={"metadata": None})
 
-            print("HIT E", file=sys.stderr)
-
             # Parse metrics from JSON string if present
             metrics = json.loads(metadata.get("swe-bench-metrics", "{}")) if "swe-bench-metrics" in metadata else {}
 
@@ -427,8 +418,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
-            print("HIT F", file=sys.stderr)
-
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input,

From cbf9c35bd7da8e873846348af29fec271593a1ba Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:23:50 -0800
Subject: [PATCH 103/127] print each

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 9e6a9de2c..ca9be4808 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -418,6 +418,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
+            print(f"{params_with_input=}\n{response=}\n{metrics=}", file=sys.stderr)
+
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input,

From d61f0ee0129f30d9038e8fef7d30db12894243a0 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:43:20 -0800
Subject: [PATCH 104/127] try model dump

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index ca9be4808..0f0e2016d 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -423,7 +423,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input,
-                response=response,
+                response=response.model_dump(),
                 reward=reward,
                 resolved=1.0 if resolved else 0.0,
                 patch_exists=1.0 if patch_exists else 0.0,

From 16085af32781a329d79b125deed1e4ef13d6036f Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:46:13 -0800
Subject: [PATCH 105/127] model dump again

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 0f0e2016d..d0024f1b2 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -418,11 +418,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
-            print(f"{params_with_input=}\n{response=}\n{metrics=}", file=sys.stderr)
-
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
-                responses_create_params=params_with_input,
+                responses_create_params=params_with_input.model_dump(),
                 response=response.model_dump(),
                 reward=reward,
                 resolved=1.0 if resolved else 0.0,

From 95d08c150b903e8239b34859d09e6f0e8724d3a8 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:49:48 -0800
Subject: [PATCH 106/127] breakpoint

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index d0024f1b2..786a67339 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -418,6 +418,11 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
+            params_with_input.model_dump()
+            response.model_dump()
+
+            print("HIT BREAKPOINT")
+
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input.model_dump(),

From 0a9e25fac8b8baa952ffd33acda3cbe05b730484 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:53:08 -0800
Subject: [PATCH 107/127] stderr

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 786a67339..f70a32e4f 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -421,7 +421,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             params_with_input.model_dump()
             response.model_dump()
 
-            print("HIT BREAKPOINT")
+            print("HIT BREAKPOINT", file=sys.stderr)
 
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(

From 4caf1173d2c978c78bf59afc092f467d11b82f43 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 10:57:14 -0800
Subject: [PATCH 108/127] separate breakpoints

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index f70a32e4f..d4b7da7a1 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -419,9 +419,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             reward = 1.0 if resolved else 0.0
 
             params_with_input.model_dump()
+            print("HIT BREAKPOINT A", file=sys.stderr)
             response.model_dump()
-
-            print("HIT BREAKPOINT", file=sys.stderr)
+            print("HIT BREAKPOINT B", file=sys.stderr)
 
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(

From 1fc1a0864ce559145e23dad63703f01bd6481ce1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 11:00:27 -0800
Subject: [PATCH 109/127] try model dump

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index d4b7da7a1..3b8c23b70 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -398,7 +398,10 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             # Add the extracted input messages and tools to the params
             # Note: tools should already be in the correct format from the response
             params_with_input = fixed_params.model_copy(
-                update={"input": input_messages, "tools": response.tools if response.tools else []}
+                update={
+                    "input": input_messages,
+                    "tools": [t.model_dump() for t in response.tools] if response.tools else [],
+                }
             )
 
             # Extract metrics from response metadata
@@ -425,8 +428,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
-                responses_create_params=params_with_input.model_dump(),
-                response=response.model_dump(),
+                responses_create_params=params_with_input,
+                response=response,
                 reward=reward,
                 resolved=1.0 if resolved else 0.0,
                 patch_exists=1.0 if patch_exists else 0.0,

From 0bf54608d2d2b12eba436349502f973ded3ce359 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 11:05:43 -0800
Subject: [PATCH 110/127] print

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 3b8c23b70..55c562705 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -366,7 +366,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
         """Run and verify SWE-bench solution."""
         async with self.sem:
-            print(f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True)
+            if self.config.debug:
+                print(
+                    f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True
+                )
             body.responses_create_params.metadata["container_concurrency"] = self.config.concurrency - self.sem._value
 
             # Fix None values in responses_create_params to use defaults
@@ -421,10 +424,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
-            params_with_input.model_dump()
+            print(params_with_input.model_dump(), file=sys.stderr)
+            print(params_with_input.metadata, file=sys.stderr)
             print("HIT BREAKPOINT A", file=sys.stderr)
-            response.model_dump()
-            print("HIT BREAKPOINT B", file=sys.stderr)
 
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(

From 466690b16a6fe20877eadf59cf7e377d01a2ed24 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 11:20:26 -0800
Subject: [PATCH 111/127] print type v

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 55c562705..afd5ec812 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -426,6 +426,8 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             print(params_with_input.model_dump(), file=sys.stderr)
             print(params_with_input.metadata, file=sys.stderr)
+            for k, v in params_with_input.metadata.items():
+                print(f"{k}: {type(v)}")
             print("HIT BREAKPOINT A", file=sys.stderr)
 
             # Build verification response with top-level numeric fields for statistics

From 43554f9f8562f6336e752abf9b93b267646366b2 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 11:24:36 -0800
Subject: [PATCH 112/127] stderr

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index afd5ec812..cee4080d0 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -427,7 +427,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
             print(params_with_input.model_dump(), file=sys.stderr)
             print(params_with_input.metadata, file=sys.stderr)
             for k, v in params_with_input.metadata.items():
-                print(f"{k}: {type(v)}")
+                print(f"{k}: {type(v)}", file=sys.stderr)
             print("HIT BREAKPOINT A", file=sys.stderr)
 
             # Build verification response with top-level numeric fields for statistics

From 56384c68dc0eddce9f57a3c1c092c0f0fb221d55 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 11:27:15 -0800
Subject: [PATCH 113/127] resolve metadata

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index cee4080d0..72d1c0c42 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -370,7 +370,9 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
                 print(
                     f"Semaphore: {self.config.concurrency - self.sem._value} / {self.config.concurrency}", flush=True
                 )
-            body.responses_create_params.metadata["container_concurrency"] = self.config.concurrency - self.sem._value
+            body.responses_create_params.metadata["container_concurrency"] = str(
+                self.config.concurrency - self.sem._value
+            )
 
             # Fix None values in responses_create_params to use defaults
             # This is needed because the pydantic model has non-Optional fields with defaults
@@ -424,12 +426,6 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
-            print(params_with_input.model_dump(), file=sys.stderr)
-            print(params_with_input.metadata, file=sys.stderr)
-            for k, v in params_with_input.metadata.items():
-                print(f"{k}: {type(v)}", file=sys.stderr)
-            print("HIT BREAKPOINT A", file=sys.stderr)
-
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input,

From 56fdd0024bfa22add736bb7b6bf79f8430261865 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 13:47:19 -0800
Subject: [PATCH 114/127] openhands should log

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py           | 2 ++
 responses_api_agents/swe_agents/run_openhands.py | 8 ++++++--
 responses_api_agents/swe_agents/utils.py         | 2 ++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 72d1c0c42..4f4773690 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -174,6 +174,7 @@ class SWEBenchWrapperConfig(BaseResponsesAPIAgentConfig):
         description="Session ID for the run",
     )
 
+    openhands_should_log: bool = False
     debug: bool = False
 
 
@@ -267,6 +268,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "r2e_gym_setup_dir": self.config.r2e_gym_setup_dir,
                 "dataset_path": self.config.dataset_path,
                 "ray_queue_time": ray_queue_time,
+                "openhands_should_log": self.config.openhands_should_log,
                 "debug": self.config.debug,
                 "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb,
                 "command_exec_timeout": self.config.command_exec_timeout,
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 11e1611ae..2bdf6ef11 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -102,6 +102,7 @@ class RunOpenHandsAgent:
     swebench_setup_dir: Path | None = None
     r2e_gym_setup_dir: Path | None = None
     dataset_path: str | None = None
+    openhands_should_log: bool = False
     debug: bool = False
 
     async def _run_swe_agent(self, data_point, api_base):
@@ -230,8 +231,12 @@ async def _run_openhands(
         agent_script_name = f"agent_script_{agent_run_id}.sh"
 
         if self.debug:
-            log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && "
             profiling_cmd = "export NG_PROFILING_DIR=/trajectories_mount/profiling && "
+        else:
+            profiling_cmd = ""
+
+        if self.openhands_should_log:
+            log_cmd = "export LOG_LEVEL=DEBUG && export LOG_TO_FILE=true && export NG_OPENHANDS_SHOULD_LOG=true && "
         else:
             log_cmd = (
                 "export LOG_LEVEL=CRITICAL && "
@@ -241,7 +246,6 @@ async def _run_openhands(
                 "export LOG_ALL_EVENTS=False && "
                 "export DEBUG_RUNTIME=False && "
             )
-            profiling_cmd = ""
 
         agent_main_cmd = (
             "if [ -d /workspace ]; then "
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 80f39c061..14dc7912f 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -642,6 +642,7 @@ async def run_swebench_evaluation(
     dataset_path: Optional[str] = None,
     ray_queue_time: Optional[float] = None,
     ray_submit_time: Optional[float] = None,
+    openhands_should_log: bool = False,
     debug: bool = False,
     apptainer_memory_limit_mb: Optional[int] = None,
     command_exec_timeout: Optional[int] = None,
@@ -687,6 +688,7 @@ async def run_swebench_evaluation(
         swebench_setup_dir=swebench_setup_dir,
         r2e_gym_setup_dir=r2e_gym_setup_dir,
         dataset_path=dataset_path,
+        openhands_should_log=openhands_should_log,
         debug=debug,
     )
 

From dccd50e3da1fa2aefcca8b16cb98702844b89e2b Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Mon, 26 Jan 2026 13:58:07 -0800
Subject: [PATCH 115/127] pipe global config dict

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py           | 6 ++++++
 responses_api_agents/swe_agents/run_openhands.py | 2 ++
 responses_api_agents/swe_agents/utils.py         | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 4f4773690..d729a63c1 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -14,6 +14,7 @@
 import asyncio
 import json
 import os
+import shlex
 import sys
 import time
 import uuid
@@ -36,6 +37,7 @@
     SimpleResponsesAPIAgent,
 )
 from nemo_gym.config_types import ModelServerRef
+from nemo_gym.global_config import OmegaConf, get_global_config_dict
 from nemo_gym.openai_utils import (
     NeMoGymResponse,
     NeMoGymResponseCreateParamsNonStreaming,
@@ -210,6 +212,7 @@ class SWEBenchWrapper(SimpleResponsesAPIAgent):
     config: SWEBenchWrapperConfig
     sem: Semaphore = None
     _container_counter: ConcurrentContainerCounter = None
+    _global_config_dict_str: str = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def model_post_init(self, __context: Any) -> None:
@@ -231,6 +234,8 @@ def model_post_init(self, __context: Any) -> None:
         self.config.run_session_id = f"{int(time.time() * 1000)}_{str(uuid.uuid4())[:8]}"
         print(f"Run session ID: {self.config.run_session_id}", flush=True)
 
+        self._global_config_dict_str = shlex.quote(OmegaConf.to_yaml(get_global_config_dict()))
+
     async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()) -> NeMoGymResponse:
         # Extract problem information from request
         problem_info = extract_problem_info(
@@ -270,6 +275,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "ray_queue_time": ray_queue_time,
                 "openhands_should_log": self.config.openhands_should_log,
                 "debug": self.config.debug,
+                "ng_global_config_dict_str": self._global_config_dict_str,
                 "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb,
                 "command_exec_timeout": self.config.command_exec_timeout,
             }
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 2bdf6ef11..6d8e4a203 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -97,6 +97,7 @@ class SweBenchGenerationConfig:
 @dataclass
 class RunOpenHandsAgent:
     cfg: SweBenchGenerationConfig
+    ng_global_config_dict_str: str
     output_dir: str = None
     openhands_setup_dir: Path | None = None
     swebench_setup_dir: Path | None = None
@@ -270,6 +271,7 @@ async def _run_openhands(
             "export RUNTIME=local && "
             f"{log_cmd}"
             f"{profiling_cmd}"
+            f"export NEMO_GYM_CONFIG_DICT={self.ng_global_config_dict_str} && "
             "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && "
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 14dc7912f..791876a00 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -634,6 +634,7 @@ async def run_swebench_evaluation(
     swebench_tests_timeout: int,
     swebench_agent_timeout: int,
     persistent_dir: Path,
+    ng_global_config_dict_str: str,
     agent_framework_repo: Optional[str] = None,
     agent_framework_commit: str = "HEAD",
     openhands_setup_dir: Optional[Path] = None,
@@ -688,6 +689,7 @@ async def run_swebench_evaluation(
         swebench_setup_dir=swebench_setup_dir,
         r2e_gym_setup_dir=r2e_gym_setup_dir,
         dataset_path=dataset_path,
+        ng_global_config_dict_str=ng_global_config_dict_str,
         openhands_should_log=openhands_should_log,
         debug=debug,
     )

From 26dcf232c4f57dcce84c04173ddde543869182fe Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 27 Jan 2026 09:54:27 -0800
Subject: [PATCH 116/127] use num cpus 1

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index d729a63c1..78e010016 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -83,7 +83,7 @@ def decrement(self):
     runtime_env={
         "py_executable": sys.executable,
     },
-    num_cpus=0.5,
+    num_cpus=1,
 )
 def runner_ray_remote(
     concurrent_container_counter: ConcurrentContainerCounter, runner: Callable, params: dict[str, Any]

From 92e00fd01d48c1eef013716e9f31e8b9864604f8 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 27 Jan 2026 21:04:42 -0800
Subject: [PATCH 117/127] pipe model name

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py           | 1 +
 responses_api_agents/swe_agents/run_openhands.py | 2 ++
 responses_api_agents/swe_agents/utils.py         | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 78e010016..3376ede08 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -275,6 +275,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "ray_queue_time": ray_queue_time,
                 "openhands_should_log": self.config.openhands_should_log,
                 "debug": self.config.debug,
+                "model_server_name": self.config.model_server.name,
                 "ng_global_config_dict_str": self._global_config_dict_str,
                 "apptainer_memory_limit_mb": self.config.apptainer_memory_limit_mb,
                 "command_exec_timeout": self.config.command_exec_timeout,
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 6d8e4a203..c5cdcdfdf 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -98,6 +98,7 @@ class SweBenchGenerationConfig:
 class RunOpenHandsAgent:
     cfg: SweBenchGenerationConfig
     ng_global_config_dict_str: str
+    model_server_name: str
     output_dir: str = None
     openhands_setup_dir: Path | None = None
     swebench_setup_dir: Path | None = None
@@ -272,6 +273,7 @@ async def _run_openhands(
             f"{log_cmd}"
             f"{profiling_cmd}"
             f"export NEMO_GYM_CONFIG_DICT={self.ng_global_config_dict_str} && "
+            f"export NEMO_GYM_MODEL_SERVER_NAME={self.model_server_name} &&"
             "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && "
             "export PATH=$PATH:/openhands_setup/OpenHands/.venv/bin && "
             # CRITICAL: Configure poetry to only use the OpenHands venv (ignore external venvs)
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 791876a00..61d38085f 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -635,6 +635,7 @@ async def run_swebench_evaluation(
     swebench_agent_timeout: int,
     persistent_dir: Path,
     ng_global_config_dict_str: str,
+    model_server_name: str,
     agent_framework_repo: Optional[str] = None,
     agent_framework_commit: str = "HEAD",
     openhands_setup_dir: Optional[Path] = None,
@@ -692,6 +693,7 @@ async def run_swebench_evaluation(
         ng_global_config_dict_str=ng_global_config_dict_str,
         openhands_should_log=openhands_should_log,
         debug=debug,
+        model_server_name=model_server_name,
     )
 
     result = await run_oh.process_single_datapoint(problem_info, persistent_dir)

From 1d6ce57f0a5ac77006a178587f10836b99a47d68 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:03:34 -0800
Subject: [PATCH 118/127] start add profiling metrics

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 3376ede08..d5b3aff98 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -205,6 +205,27 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     patch_exists: Optional[float] = None  # 1.0 if patch exists, 0.0 otherwise
     patch_successfully_applied: Optional[float] = None  # 1.0 if patch applied, 0.0 otherwise
 
+    # Profiling time metrics to report
+    # ray_queue_time: float
+    # generation_apptainer_spinup_time: float
+    # create_runtime_time: float
+    # container_initialization_time: float
+    # connect_to_runtime_time: float
+    # runtime_initialization_fn_time: float
+    # total_command_exec_time: float
+    # total_model_call_time: float
+    # final_eval_apptainer_spinup_time: float
+    # final_eval_time: float
+
+    # Exit condition metrics to report
+    # TODO add more exit conditions
+    # hit_sample_timeout: bool
+    # hit_trajectory_command_exec_timeout: bool
+    # hit_eval_timeout: bool
+    # hit_results_parsing_failure: bool
+    # hit_success: bool
+    # hit_unknown: bool
+
 
 class SWEBenchWrapper(SimpleResponsesAPIAgent):
     """Wrapper for NeMo-Skills SWE-bench evaluation in NeMo-Gym."""
@@ -326,6 +347,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "agent_framework": self.config.agent_framework,
                 "has_trajectory": str(trajectory is not None),
                 "instance_id": result.get("instance_id", problem_info.get("instance_id", "unknown")),
+                "instance_dir": instance_dir,
             }
 
             # Add evaluation results to metadata (convert to strings)

From b8212434e5cd31cbd4a95a2de8f819a434b4466d Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:05:25 -0800
Subject: [PATCH 119/127] add placeholder

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index d5b3aff98..5fa0f32c1 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -473,6 +473,22 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
                     "patch_successfully_applied": patch_applied,
                     "resolved": resolved,
                 },
+                # ray_queue_time=,
+                # generation_apptainer_spinup_time=,
+                # create_runtime_time=,
+                # container_initialization_time=,
+                # connect_to_runtime_time=,
+                # runtime_initialization_fn_time=,
+                # total_command_exec_time=,
+                # total_model_call_time=,
+                # final_eval_apptainer_spinup_time=,
+                # final_eval_time=,
+                # hit_sample_timeout=,
+                # hit_trajectory_command_exec_timeout=,
+                # hit_eval_timeout=,
+                # hit_results_parsing_failure=,
+                # hit_success=,
+                # hit_unknown=,
             )
 
 

From 2f49809fba2ce9283c9f80a1f9919741e028df20 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:19:55 -0800
Subject: [PATCH 120/127] hit success

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 5fa0f32c1..050126834 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -348,6 +348,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "has_trajectory": str(trajectory is not None),
                 "instance_id": result.get("instance_id", problem_info.get("instance_id", "unknown")),
                 "instance_dir": instance_dir,
+                "hit_success_str": json.dumps(bool(output_items)),
             }
 
             # Add evaluation results to metadata (convert to strings)
@@ -391,7 +392,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 parallel_tool_calls=False,
                 tool_choice="none",
                 tools=[],
-                metadata={"error": str(e)},
+                metadata={
+                    "error": str(e),
+                    "hit_success_str": json.dumps(False),
+                },
             )
 
     async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
@@ -487,7 +491,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
                 # hit_trajectory_command_exec_timeout=,
                 # hit_eval_timeout=,
                 # hit_results_parsing_failure=,
-                # hit_success=,
+                hit_success=json.loads(metadata["hit_success_str"]),
                 # hit_unknown=,
             )
 

From 7e018e4a97fc0144892da3552f02a5e53607ba96 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:20:05 -0800
Subject: [PATCH 121/127] hit success

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 050126834..382e8aa3c 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -223,7 +223,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     # hit_trajectory_command_exec_timeout: bool
     # hit_eval_timeout: bool
     # hit_results_parsing_failure: bool
-    # hit_success: bool
+    hit_success: bool
     # hit_unknown: bool
 
 

From 2a73772e4050ed581bdb571cce938dce1d3b3da4 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:24:55 -0800
Subject: [PATCH 122/127] hit unknown

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 382e8aa3c..a573c21ab 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -224,7 +224,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     # hit_eval_timeout: bool
     # hit_results_parsing_failure: bool
     hit_success: bool
-    # hit_unknown: bool
+    hit_unknown: bool
 
 
 class SWEBenchWrapper(SimpleResponsesAPIAgent):
@@ -461,6 +461,15 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
+            hit_metrics = dict(
+                # hit_sample_timeout=,
+                # hit_trajectory_command_exec_timeout=,
+                # hit_eval_timeout=,
+                # hit_results_parsing_failure=,
+                hit_success=json.loads(metadata["hit_success_str"]),
+            )
+            hit_metrics["hit_unknown"] = not any(hit_metrics.values())
+
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(
                 responses_create_params=params_with_input,
@@ -487,12 +496,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
                 # total_model_call_time=,
                 # final_eval_apptainer_spinup_time=,
                 # final_eval_time=,
-                # hit_sample_timeout=,
-                # hit_trajectory_command_exec_timeout=,
-                # hit_eval_timeout=,
-                # hit_results_parsing_failure=,
-                hit_success=json.loads(metadata["hit_success_str"]),
-                # hit_unknown=,
+                **hit_metrics,
             )
 
 

From d648e56217c66775da197fcd82d5a581bf30a1cd Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:33:36 -0800
Subject: [PATCH 123/127] hit_empty_trajectory

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index a573c21ab..f04ada4ec 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -222,7 +222,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     # hit_sample_timeout: bool
     # hit_trajectory_command_exec_timeout: bool
     # hit_eval_timeout: bool
-    # hit_results_parsing_failure: bool
+    hit_empty_trajectory: bool
     hit_success: bool
     hit_unknown: bool
 
@@ -349,6 +349,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_id": result.get("instance_id", problem_info.get("instance_id", "unknown")),
                 "instance_dir": instance_dir,
                 "hit_success_str": json.dumps(bool(output_items)),
+                "hit_empty_trajectory_str": json.dumps(not trajectory),
             }
 
             # Add evaluation results to metadata (convert to strings)
@@ -395,6 +396,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 metadata={
                     "error": str(e),
                     "hit_success_str": json.dumps(False),
+                    "hit_empty_trajectory_str": json.dumps((not trajectory) if "trajectory" in dir() else False),
                 },
             )
 
@@ -466,6 +468,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
                 # hit_trajectory_command_exec_timeout=,
                 # hit_eval_timeout=,
                 # hit_results_parsing_failure=,
+                hit_empty_trajectory=json.loads(metadata["hit_empty_trajectory_str"]),
                 hit_success=json.loads(metadata["hit_success_str"]),
             )
             hit_metrics["hit_unknown"] = not any(hit_metrics.values())

From 0ab3b8606add5a31f13777404768a758431d88de Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:38:10 -0800
Subject: [PATCH 124/127] hit_responses_exception

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index f04ada4ec..64d8b137b 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -224,7 +224,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     # hit_eval_timeout: bool
     hit_empty_trajectory: bool
     hit_success: bool
-    hit_unknown: bool
+    hit_responses_exception: bool
 
 
 class SWEBenchWrapper(SimpleResponsesAPIAgent):
@@ -350,6 +350,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "instance_dir": instance_dir,
                 "hit_success_str": json.dumps(bool(output_items)),
                 "hit_empty_trajectory_str": json.dumps(not trajectory),
+                "hit_responses_exception_str": json.dumps(False),
             }
 
             # Add evaluation results to metadata (convert to strings)
@@ -397,6 +398,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                     "error": str(e),
                     "hit_success_str": json.dumps(False),
                     "hit_empty_trajectory_str": json.dumps((not trajectory) if "trajectory" in dir() else False),
+                    "hit_responses_exception_str": json.dumps(True),
                 },
             )
 
@@ -463,15 +465,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
 
             reward = 1.0 if resolved else 0.0
 
-            hit_metrics = dict(
-                # hit_sample_timeout=,
-                # hit_trajectory_command_exec_timeout=,
-                # hit_eval_timeout=,
-                # hit_results_parsing_failure=,
-                hit_empty_trajectory=json.loads(metadata["hit_empty_trajectory_str"]),
-                hit_success=json.loads(metadata["hit_success_str"]),
-            )
-            hit_metrics["hit_unknown"] = not any(hit_metrics.values())
+            hit_metrics = {k.removesuffix("_str"): json.loads(v) for k, v in metadata.items() if k.startswith("hit_")}
 
             # Build verification response with top-level numeric fields for statistics
             return SWEBenchVerifyResponse(

From 769c155825c9a0aba003489c10edc2ce85cc2045 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:42:08 -0800
Subject: [PATCH 125/127] plumb NEMO_GYM_METRICS_FPATH

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py           | 2 ++
 responses_api_agents/swe_agents/run_openhands.py | 2 ++
 responses_api_agents/swe_agents/utils.py         | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 64d8b137b..658578518 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -274,6 +274,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
         workspace_root = Path(os.path.dirname(os.path.abspath(__file__)))
         persistent_dir = workspace_root / f"swebench_results_{self.config.run_session_id}" / instance_dir
         persistent_dir.mkdir(parents=True, exist_ok=True)
+        metrics_fpath = persistent_dir / "nemo_gym_metrics.json"
         try:
             ray_queue_time = time.time()
             params = {
@@ -287,6 +288,7 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
                 "swebench_tests_timeout": self.config.swebench_tests_timeout,
                 "swebench_agent_timeout": self.config.swebench_agent_timeout,
                 "persistent_dir": persistent_dir,
+                "metrics_fpath": metrics_fpath,
                 "agent_framework_repo": self.config.agent_framework_repo,
                 "agent_framework_commit": self.config.agent_framework_commit,
                 "openhands_setup_dir": self.config.openhands_setup_dir,
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index c5cdcdfdf..073097678 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -106,6 +106,7 @@ class RunOpenHandsAgent:
     dataset_path: str | None = None
     openhands_should_log: bool = False
     debug: bool = False
+    metrics_fpath: Path
 
     async def _run_swe_agent(self, data_point, api_base):
         """
@@ -272,6 +273,7 @@ async def _run_openhands(
             "export RUNTIME=local && "
             f"{log_cmd}"
             f"{profiling_cmd}"
+            f"export NEMO_GYM_METRICS_FPATH={self.metrics_fpath} && "
             f"export NEMO_GYM_CONFIG_DICT={self.ng_global_config_dict_str} && "
             f"export NEMO_GYM_MODEL_SERVER_NAME={self.model_server_name} &&"
             "export VIRTUAL_ENV=/openhands_setup/OpenHands/.venv && "
diff --git a/responses_api_agents/swe_agents/utils.py b/responses_api_agents/swe_agents/utils.py
index 61d38085f..5f08d3a7e 100644
--- a/responses_api_agents/swe_agents/utils.py
+++ b/responses_api_agents/swe_agents/utils.py
@@ -634,6 +634,7 @@ async def run_swebench_evaluation(
     swebench_tests_timeout: int,
     swebench_agent_timeout: int,
     persistent_dir: Path,
+    metrics_fpath: Path,
     ng_global_config_dict_str: str,
     model_server_name: str,
     agent_framework_repo: Optional[str] = None,
@@ -694,6 +695,7 @@ async def run_swebench_evaluation(
         openhands_should_log=openhands_should_log,
         debug=debug,
         model_server_name=model_server_name,
+        metrics_fpath=metrics_fpath,
     )
 
     result = await run_oh.process_single_datapoint(problem_info, persistent_dir)

From 3a42aef4cddd4faddaf4079fda8537f3e00ad8e1 Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 10:50:15 -0800
Subject: [PATCH 126/127] report time metrics

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 658578518..711f5c1a3 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -91,6 +91,10 @@ def runner_ray_remote(
     ray_submit_time = time.time()
     params["ray_submit_time"] = ray_submit_time
 
+    # This is the first write to the metrics file, so there is no existing content to load and merge
+    with params["metrics_fpath"].open("w") as f:
+        json.dump({"ray_queue_time": ray_submit_time - params["ray_queue_time"]}, f)
+
     if params["debug"]:
         concurrent_containers = ray.get(concurrent_container_counter.increment.remote())
         print(f"Concurrent container #{concurrent_containers}", file=sys.stderr)
@@ -206,7 +210,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     patch_successfully_applied: Optional[float] = None  # 1.0 if patch applied, 0.0 otherwise
 
     # Profiling time metrics to report
-    # ray_queue_time: float
+    ray_queue_time: float
     # generation_apptainer_spinup_time: float
     # create_runtime_time: float
     # container_initialization_time: float
@@ -364,6 +368,8 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
             if "swe-bench-metrics" in result:
                 metadata["swe-bench-metrics"] = json.dumps(result["swe-bench-metrics"])
 
+            metadata["timing_metrics"] = metrics_fpath.read_text()
+
             return NeMoGymResponse(
                 id=f"swebench-{problem_info.get('instance_id', 'unknown')}",
                 created_at=int(time.time()),
@@ -485,16 +491,7 @@ async def run(self, body: SWEBenchRunRequest) -> SWEBenchVerifyResponse:
                     "patch_successfully_applied": patch_applied,
                     "resolved": resolved,
                 },
-                # ray_queue_time=,
-                # generation_apptainer_spinup_time=,
-                # create_runtime_time=,
-                # container_initialization_time=,
-                # connect_to_runtime_time=,
-                # runtime_initialization_fn_time=,
-                # total_command_exec_time=,
-                # total_model_call_time=,
-                # final_eval_apptainer_spinup_time=,
-                # final_eval_time=,
+                **json.loads(metadata["timing_metrics"]),
                 **hit_metrics,
             )
 

From 40874f8b8e61a9c033937384e75d78e84c89e09d Mon Sep 17 00:00:00 2001
From: Brian Yu <bxyu@nvidia.com>
Date: Tue, 3 Feb 2026 13:01:44 -0800
Subject: [PATCH 127/127] final eval time

Signed-off-by: Brian Yu <bxyu@nvidia.com>
---
 responses_api_agents/swe_agents/app.py           | 2 +-
 responses_api_agents/swe_agents/run_openhands.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index 711f5c1a3..bc27ad071 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -219,7 +219,7 @@ class SWEBenchVerifyResponse(BaseVerifyResponse):
     # total_command_exec_time: float
     # total_model_call_time: float
     # final_eval_apptainer_spinup_time: float
-    # final_eval_time: float
+    final_eval_time: float
 
     # Exit condition metrics to report
     # TODO add more exit conditions
diff --git a/responses_api_agents/swe_agents/run_openhands.py b/responses_api_agents/swe_agents/run_openhands.py
index 073097678..8007cd492 100644
--- a/responses_api_agents/swe_agents/run_openhands.py
+++ b/responses_api_agents/swe_agents/run_openhands.py
@@ -1116,6 +1116,10 @@ async def process_single_datapoint(self, data_point: dict[str, Any], persistent_
                 "evaluation_time": evaluation_time,
             }
 
+            nemo_gym_metrics = json.loads(self.metrics_fpath.read_text())
+            with self.metrics_fpath.open("w") as f:
+                json.dump(nemo_gym_metrics | {"final_eval_time": evaluation_time}, f)
+
             return output_dict
         finally:
             self._cleanup_instance_dataset(instance_dataset_path)