Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
129 commits
Select commit Hold shift + click to select a range
9a478ef
print client response error on debug
bxyu-nvidia Jan 21, 2026
5c0f158
print result
bxyu-nvidia Jan 22, 2026
ec0f4d4
print params
bxyu-nvidia Jan 22, 2026
f3b8e3b
pritn
bxyu-nvidia Jan 22, 2026
8b9b287
print
bxyu-nvidia Jan 22, 2026
86b6fd9
clean
bxyu-nvidia Jan 22, 2026
f46109f
try traceback
bxyu-nvidia Jan 22, 2026
8971ad5
print
bxyu-nvidia Jan 22, 2026
11eef90
print
bxyu-nvidia Jan 22, 2026
6d384fc
print
bxyu-nvidia Jan 22, 2026
91271d5
print
bxyu-nvidia Jan 22, 2026
35cb228
clean
bxyu-nvidia Jan 22, 2026
cb0ff0b
print
bxyu-nvidia Jan 22, 2026
badddba
print
bxyu-nvidia Jan 22, 2026
4e239f7
print exc
bxyu-nvidia Jan 22, 2026
890376e
print
bxyu-nvidia Jan 22, 2026
702e299
print
bxyu-nvidia Jan 22, 2026
3b911a7
print hit
bxyu-nvidia Jan 22, 2026
6ba18ad
clean
bxyu-nvidia Jan 22, 2026
3726bba
clean
bxyu-nvidia Jan 22, 2026
7ae04fa
clean
bxyu-nvidia Jan 22, 2026
78cce58
feat: oh metrics block commands
sdevare-nv Jan 22, 2026
f14ab3c
Merge branch 'sdd/oh-metric-block-commands' of https://github.com/NVI…
bxyu-nvidia Jan 22, 2026
2dac871
try cpus 0.5; print num containers in parallel
bxyu-nvidia Jan 22, 2026
be3f4c2
try add container counter param
bxyu-nvidia Jan 22, 2026
98efc9f
use private
bxyu-nvidia Jan 22, 2026
1709e9e
ray get
bxyu-nvidia Jan 22, 2026
a4ed09d
fix
bxyu-nvidia Jan 22, 2026
f912c54
print usage
bxyu-nvidia Jan 22, 2026
5ffe448
flush
bxyu-nvidia Jan 22, 2026
f9450a2
try disable
bxyu-nvidia Jan 22, 2026
8575e36
revert prefix server logs
bxyu-nvidia Jan 22, 2026
5f7079d
clean
bxyu-nvidia Jan 22, 2026
5403fbc
clean
bxyu-nvidia Jan 22, 2026
8fc2ebd
clean
bxyu-nvidia Jan 22, 2026
ba6153c
try logger warning
bxyu-nvidia Jan 22, 2026
f56c36f
print usage again
bxyu-nvidia Jan 22, 2026
49c7a90
try info
bxyu-nvidia Jan 22, 2026
d1bda2d
try info into warning
bxyu-nvidia Jan 22, 2026
dbd49de
try redirect stdout
bxyu-nvidia Jan 22, 2026
3025ad6
redirect inside too
bxyu-nvidia Jan 22, 2026
e82a6af
set log level
bxyu-nvidia Jan 22, 2026
ee0af2f
try redirect
bxyu-nvidia Jan 22, 2026
843b0cb
try print with file
bxyu-nvidia Jan 22, 2026
7ef8b07
try print again
bxyu-nvidia Jan 22, 2026
eb9baff
wrap entire call
bxyu-nvidia Jan 22, 2026
516b66f
just use std err
bxyu-nvidia Jan 22, 2026
5a0c1a2
try impl dump graph
bxyu-nvidia Jan 23, 2026
c029cb6
add print
bxyu-nvidia Jan 23, 2026
d1f72de
clean
bxyu-nvidia Jan 23, 2026
cc67ad3
add prints
bxyu-nvidia Jan 23, 2026
b085a14
try timeout and kill
bxyu-nvidia Jan 23, 2026
ea35354
revert
bxyu-nvidia Jan 23, 2026
946b8dc
clean
bxyu-nvidia Jan 23, 2026
1e2dad1
simplify get global config dict
bxyu-nvidia Jan 23, 2026
826310f
try fix serialization error
bxyu-nvidia Jan 23, 2026
f893f09
try refactor into profiler
bxyu-nvidia Jan 23, 2026
e504716
fix
bxyu-nvidia Jan 23, 2026
b7f27a9
try add profiling to instance
bxyu-nvidia Jan 23, 2026
744567f
pip ng profiling dir
bxyu-nvidia Jan 23, 2026
1cbf063
pipe
bxyu-nvidia Jan 23, 2026
6712ab2
use name
bxyu-nvidia Jan 23, 2026
17a2ce9
try switch commits
bxyu-nvidia Jan 23, 2026
24516f1
bump openhands
bxyu-nvidia Jan 23, 2026
d46d440
convert to list
bxyu-nvidia Jan 23, 2026
53c009c
try mount profiling dir
bxyu-nvidia Jan 23, 2026
9e9185e
print mount args
bxyu-nvidia Jan 23, 2026
af13f04
try reuse trajectories dir
bxyu-nvidia Jan 23, 2026
13f99cc
clean
bxyu-nvidia Jan 23, 2026
b046da5
actually print
bxyu-nvidia Jan 23, 2026
94ff7db
bump openhands
bxyu-nvidia Jan 23, 2026
ee61b6c
try dict functool to avoid serialization error
bxyu-nvidia Jan 23, 2026
c345ccc
clean
bxyu-nvidia Jan 23, 2026
6606ae4
print tool
bxyu-nvidia Jan 23, 2026
8a78997
just fail
bxyu-nvidia Jan 23, 2026
c1cb16b
clean
bxyu-nvidia Jan 23, 2026
6d7172e
try error on warnings
bxyu-nvidia Jan 23, 2026
22342ac
print metadata
bxyu-nvidia Jan 23, 2026
1b74d0e
print many newlines
bxyu-nvidia Jan 23, 2026
37312c1
dump
bxyu-nvidia Jan 23, 2026
f600cdb
feat: update oh w/ mem limt and cmd timeout
sdevare-nv Jan 23, 2026
591b409
error and prints
bxyu-nvidia Jan 23, 2026
9deffae
dont error on warnings
bxyu-nvidia Jan 24, 2026
83b5851
clean
bxyu-nvidia Jan 24, 2026
879c623
ignore pydantic serialization warnings
bxyu-nvidia Jan 24, 2026
01f7a8d
try filter by message
bxyu-nvidia Jan 24, 2026
b29b432
revert back to function tool
bxyu-nvidia Jan 24, 2026
78dc107
add comment
bxyu-nvidia Jan 24, 2026
028a9d7
add example
bxyu-nvidia Jan 24, 2026
99d1fd5
try profiling openhands
bxyu-nvidia Jan 24, 2026
cb4a040
bump
bxyu-nvidia Jan 24, 2026
1a763a7
enable logging
bxyu-nvidia Jan 24, 2026
16e8be1
feat: move copy logic to host
sdevare-nv Jan 24, 2026
9699852
revert to shared folder
bxyu-nvidia Jan 24, 2026
cc43a52
pipe debug through
bxyu-nvidia Jan 24, 2026
964a859
add apt instapll graphviz
bxyu-nvidia Jan 24, 2026
8c5f566
try apt get
bxyu-nvidia Jan 24, 2026
82414d8
remove apt install
bxyu-nvidia Jan 24, 2026
59f8a4d
dump out afterwards
bxyu-nvidia Jan 24, 2026
a3cf012
bump up pct
bxyu-nvidia Jan 26, 2026
54d95c1
increase to 5
bxyu-nvidia Jan 26, 2026
9b74f89
Merge branch 'sdd/oh-metric-block-commands' of https://github.com/NVI…
bxyu-nvidia Jan 26, 2026
a688dd6
add hits
bxyu-nvidia Jan 26, 2026
aa0b2c7
clean
bxyu-nvidia Jan 26, 2026
cbf9c35
print each
bxyu-nvidia Jan 26, 2026
d61f0ee
try model dump
bxyu-nvidia Jan 26, 2026
16085af
modeul dump again
bxyu-nvidia Jan 26, 2026
95d08c1
breakpoint
bxyu-nvidia Jan 26, 2026
0a9e25f
stderr
bxyu-nvidia Jan 26, 2026
4caf117
separate breakpoints
bxyu-nvidia Jan 26, 2026
1fc1a08
try moppdel dump
bxyu-nvidia Jan 26, 2026
0bf5460
print
bxyu-nvidia Jan 26, 2026
466690b
print type v
bxyu-nvidia Jan 26, 2026
43554f9
stderr
bxyu-nvidia Jan 26, 2026
56384c6
resolve metadata
bxyu-nvidia Jan 26, 2026
56fdd00
openhands hsould log
bxyu-nvidia Jan 26, 2026
dccd50e
pipe global config dict
bxyu-nvidia Jan 26, 2026
26dcf23
use num cpus 1
bxyu-nvidia Jan 27, 2026
92e00fd
pipe model name
bxyu-nvidia Jan 28, 2026
1d6ce57
start add profiling metrics
bxyu-nvidia Feb 3, 2026
b821243
add placeholder
bxyu-nvidia Feb 3, 2026
2f49809
hit success
bxyu-nvidia Feb 3, 2026
7e018e4
hit success
bxyu-nvidia Feb 3, 2026
2a73772
hit unknown
bxyu-nvidia Feb 3, 2026
d648e56
hit_empty_trajectory
bxyu-nvidia Feb 3, 2026
0ab3b86
hit_responses_exception
bxyu-nvidia Feb 3, 2026
769c155
plumb NEMO_GYM_METRICS_FPATH
bxyu-nvidia Feb 3, 2026
3a42aef
report time metrics
bxyu-nvidia Feb 3, 2026
40874f8
final eval time
bxyu-nvidia Feb 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions nemo_gym/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from io import StringIO
from pathlib import Path
from typing import Optional

import yappi
from gprof2dot import main as gprof2dot_main
from pydantic import BaseModel
from pydot import graph_from_dot_file


class Profiler(BaseModel):
    """Thin wrapper around yappi that profiles a process and writes the results to disk.

    ``start`` begins CPU-clock profiling, ``stop`` ends it and calls ``dump``.
    ``dump`` writes four artifacts into ``base_profile_dir``, all named after
    ``name``: a callgrind file, a gprof2dot ``.dot`` call graph, a ``.png``
    rendering of that graph, and a plain-text ``.log`` table of function stats.
    """

    # Label used in console messages and as the stem of every output file.
    name: str
    # Directory that receives the output files; created on demand by ``dump``.
    base_profile_dir: Path

    # Used to clean up and filter out unnecessary information in the yappi log.
    # When set, only stats rows containing this substring are kept in the log.
    # When None, no filtering is applied and every row is kept.
    required_str: Optional[str] = None

    def start(self) -> None:
        """Switch yappi to the CPU clock and start collecting stats."""
        yappi.set_clock_type("CPU")
        yappi.start()
        print(f"🔍 Enabled profiling for {self.name}")

    def stop(self) -> None:
        """Stop yappi and write all profiling artifacts via ``dump``."""
        print(f"🛑 Stopping profiler for {self.name}. Check {self.base_profile_dir} for the metrics!")
        yappi.stop()
        self.dump()

    def dump(self) -> None:
        """Write the callgrind file, dot graph, png graph and text log for the current stats."""
        self.base_profile_dir.mkdir(parents=True, exist_ok=True)
        log_path = self.base_profile_dir / f"{self.name}.log"
        callgrind_path = self.base_profile_dir / f"{self.name}.callgrind"
        callgrind_dotfile_path = self.base_profile_dir / f"{self.name}.dot"
        callgrind_graph_path = self.base_profile_dir / f"{self.name}.png"

        # Take one snapshot and reuse it for both the callgrind export and the text log.
        func_stats = yappi.get_func_stats()
        func_stats.save(callgrind_path, type="CALLGRIND")

        # Build argv as a list instead of splitting a formatted string, so that
        # paths containing whitespace stay intact as single arguments.
        gprof2dot_main(
            argv=[
                "--format=callgrind",
                f"--output={callgrind_dotfile_path}",
                "-e",
                "5",
                "-n",
                "5",
                str(callgrind_path),
            ]
        )

        (graph,) = graph_from_dot_file(callgrind_dotfile_path)
        graph.write_png(callgrind_graph_path)

        buffer = StringIO()
        func_stats.print_all(
            out=buffer,
            columns={
                0: ("name", 200),
                1: ("ncall", 10),
                2: ("tsub", 8),
                3: ("ttot", 8),
                4: ("tavg", 8),
            },
        )

        buffer.seek(0)
        kept_lines = []
        past_header = False
        for line in buffer:
            # Everything up to and including the column-header row is always kept.
            # After the header, rows are kept when no filter is configured or when
            # they contain ``required_str``.
            if not past_header or self.required_str is None or self.required_str in line:
                kept_lines.append(line)

            if line.startswith("name"):
                past_header = True

        with open(log_path, "w", encoding="utf-8") as f:
            f.write("".join(kept_lines))
50 changes: 10 additions & 40 deletions nemo_gym/server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import sys
from abc import abstractmethod
from contextlib import asynccontextmanager
from io import StringIO
from logging import Filter as LoggingFilter
from logging import LogRecord, getLogger
from os import environ, getenv
Expand All @@ -33,7 +32,6 @@
import ray
import requests
import uvicorn
import yappi
from aiohttp import (
ClientResponse,
ClientResponseError,
Expand Down Expand Up @@ -67,6 +65,7 @@
get_first_server_config_dict,
get_global_config_dict,
)
from nemo_gym.profiling import Profiler


_GLOBAL_AIOHTTP_CLIENT: Union[None, ClientSession] = None
Expand Down Expand Up @@ -433,6 +432,9 @@ async def exception_handling_middleware(request: Request, call_next):
)

response_content = f"Hit an exception in {self.get_session_middleware_key()} calling an inner server: {e.response_content}"
if _GLOBAL_AIOHTTP_CLIENT_REQUEST_DEBUG:
print(response_content)

return JSONResponse(content=response_content, status_code=500)
except Exception as e:
print(
Expand All @@ -449,58 +451,26 @@ async def exception_handling_middleware(request: Request, call_next):
return JSONResponse(content="An unknown error occurred", status_code=500)

def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None: # pragma: no cover
base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath
server_profile_path = (base_profile_dir / self.get_session_middleware_key()).with_suffix(".log")

base_profile_dir.mkdir(parents=True, exist_ok=True)
base_profile_dir = PARENT_DIR / profiling_config.profiling_results_dirpath / self.get_session_middleware_key()
profiler = Profiler(name=self.config.name, base_profile_dir=base_profile_dir)

main_app_lifespan = app.router.lifespan_context

def _dump_yappi_stats() -> str:
buffer = StringIO()
yappi.get_func_stats().print_all(
out=buffer,
columns={
0: ("name", 200),
1: ("ncall", 10),
2: ("tsub", 8),
3: ("ttot", 8),
4: ("tavg", 8),
},
)

buffer.seek(0)
res = ""
past_header = False
for line in buffer:
if not past_header or self.config.entrypoint in line:
res += line

if line.startswith("name"):
past_header = True

return res

@asynccontextmanager
async def lifespan_wrapper(app):
yappi.set_clock_type("CPU")
yappi.start()
print(f"🔍 Enabled profiling for {self.config.name}")
profiler.start()

async with main_app_lifespan(app) as maybe_state:
yield maybe_state

print(f"🛑 Stopping profiler for {self.config.name}. Check {server_profile_path} for the metrics!")
yappi.stop()

with open(server_profile_path, "w") as f:
f.write(_dump_yappi_stats())
profiler.stop()

app.router.lifespan_context = lifespan_wrapper

@app.get("/stats")
def stats():
return Response(_dump_yappi_stats())
profiler.dump()
return Response()

def set_ulimit(self, target_soft_limit: int = 65535): # pragma: no cover
# From https://github.com/vllm-project/vllm/blob/fed8a9b107df3e27d57728c6911c7d308b871477/vllm/utils/__init__.py#L2790
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ dependencies = [
# Updated: Thu Jan 08, 2026 with orjson==3.11.3
# License: Apache 2.0 https://github.com/ijl/orjson/blob/fb3eb1f729c7e7b019f780af5695722c99c7c695/LICENSE-APACHE
"orjson",

"gprof2dot",
"pydot",
]

[dependency-groups]
Expand Down
Loading
Loading