Skip to content

Commit 32acabb

Browse files
committed
Merge main into ilana/gpu-telemetry-dashboard
2 parents d0867fe + 64c50ba commit 32acabb

29 files changed

+814
-132
lines changed

.coderabbit.yaml

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,31 @@
1+
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
6+
# Docs: https://docs.coderabbit.ai/getting-started/configure-coderabbit/
7+
language: "en-US"
8+
early_access: false
9+
reviews:
10+
profile: "chill"
11+
request_changes_workflow: false
12+
high_level_summary: true
13+
poem: true
14+
review_status: false
15+
collapse_walkthrough: false
16+
auto_review:
17+
enabled: true
18+
drafts: false
19+
auto_incremental_review: false
20+
suggested_labels: false
21+
suggested_reviewers: false
22+
sequence_diagrams: false
23+
related_issues: false
24+
related_prs: false
25+
finishing_touches:
26+
docstrings:
27+
enabled: false
28+
unit_tests:
29+
enabled: false
30+
chat:
31+
auto_reply: true

README.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@ Features
2727
======================
2828
-->
2929

30+
<img width="1724" height="670" alt="AIPerf UI Dashboard" src="https://github.com/user-attachments/assets/7eb40867-b1c1-4ebe-bd57-7619f2154bba" />
31+
3032
## Features
3133

3234
- Scalable via multiprocess support

aiperf/__main__.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,18 +4,19 @@
44
import sys
55

66
from aiperf.cli import app
7-
from aiperf.gpu_telemetry.constants import DEFAULT_DCGM_ENDPOINT
7+
from aiperf.gpu_telemetry.constants import DEFAULT_DCGM_ENDPOINTS
88

99

1010
def main() -> int:
1111
# TODO: HACK: Remove this once we can upgrade to v4 of cyclopts
1212
# This is a hack to allow the --gpu-telemetry flag to be used without a value
13-
# and it will be set to the default endpoint, which will inform the telemetry
13+
# and it will be set to the default endpoints, which will inform the telemetry
1414
# exporter to print the telemetry to the console
1515
if "--gpu-telemetry" in sys.argv:
1616
idx = sys.argv.index("--gpu-telemetry")
1717
if idx >= len(sys.argv) - 1 or sys.argv[idx + 1].startswith("-"):
18-
sys.argv.insert(idx + 1, DEFAULT_DCGM_ENDPOINT)
18+
for endpoint in reversed(DEFAULT_DCGM_ENDPOINTS):
19+
sys.argv.insert(idx + 1, endpoint)
1920
return app(sys.argv[1:])
2021

2122

aiperf/clients/http/aiohttp_client.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -204,18 +204,19 @@ async def __aiter__(self) -> typing.AsyncIterator[tuple[str, int]]:
204204
if not first_byte:
205205
break
206206

207+
# Read until we hit \n\n which delimits SSE messages
207208
chunk = await self.response.content.readuntil(b"\n\n")
208209

209210
if not chunk:
210211
break
211212
chunk = first_byte + chunk
212213

213214
try:
215+
decoded = chunk.decode("utf-8")
216+
for sub_chunk in decoded.split("\n\n"):
217+
if sub_chunk:
218+
yield (sub_chunk, chunk_ns_first_byte)
214219
# Use the fastest available decoder
215-
yield (
216-
chunk.decode("utf-8").strip(),
217-
chunk_ns_first_byte,
218-
)
219220
except UnicodeDecodeError:
220221
# Handle potential encoding issues gracefully
221222
yield (

aiperf/clients/model_endpoint_info.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -138,11 +138,17 @@ def from_user_config(cls, user_config: UserConfig) -> "ModelEndpointInfo":
138138
def url(self) -> str:
139139
"""Get the full URL for the endpoint."""
140140
url = self.endpoint.base_url.rstrip("/") if self.endpoint.base_url else ""
141+
141142
if self.endpoint.custom_endpoint:
142-
url += "/" + self.endpoint.custom_endpoint.lstrip("/")
143-
elif path := self.endpoint.type.endpoint_path:
144-
url += "/" + path.lstrip("/")
145-
return url
143+
path = self.endpoint.custom_endpoint.lstrip("/")
144+
else:
145+
if not self.endpoint.type.endpoint_path:
146+
return url
147+
path = self.endpoint.type.endpoint_path.lstrip("/")
148+
if url.endswith("/v1") and path.startswith("v1/"):
149+
path = path[3:] # Remove the v1/ prefix
150+
151+
return f"{url}/{path}"
146152

147153
@property
148154
def primary_model(self) -> ModelInfo:

aiperf/common/config/input_config.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,13 +79,19 @@ def validate_goodput(self) -> Self:
7979
Runs after the model is constructed so we can inspect self.goodput directly.
8080
"""
8181
if self.goodput:
82+
from aiperf.common.enums import MetricType
8283
from aiperf.metrics.metric_registry import MetricRegistry
8384

8485
for tag in self.goodput:
8586
try:
86-
MetricRegistry.get_class(tag)
87+
metric_cls = MetricRegistry.get_class(tag)
8788
except MetricTypeError as e:
8889
raise ValueError(f"Unknown metric tag in --goodput: {tag}") from e
90+
if metric_cls.type == MetricType.DERIVED:
91+
raise ValueError(
92+
f"Metric '{tag}' is a Derived metric and cannot be used for --goodput. "
93+
"Use a per-record metric instead (e.g., 'inter_token_latency', 'time_to_first_token')."
94+
)
8995

9096
return self
9197

aiperf/common/config/user_config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -215,7 +215,7 @@ def _count_dataset_entries(self) -> int:
215215
list[str] | None,
216216
Field(
217217
default=None,
218-
description="Enable GPU telemetry console display and optionally specify custom DCGM exporter URLs (e.g., http://node1:9401/metrics http://node2:9401/metrics). Default localhost:9401 is always attempted",
218+
description="Enable GPU telemetry console display and optionally specify custom DCGM exporter URLs (e.g., http://node1:9401/metrics http://node2:9401/metrics). Default localhost:9400 and localhost:9401 are always attempted",
219219
),
220220
BeforeValidator(parse_str_or_list),
221221
CLIParameter(

aiperf/common/messages/telemetry_messages.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -66,9 +66,9 @@ class TelemetryStatusMessage(BaseServiceMessage):
6666
reason: str | None = Field(
6767
default=None, description="Reason why telemetry is disabled (if enabled=False)"
6868
)
69-
endpoints_tested: list[str] = Field(
69+
endpoints_configured: list[str] = Field(
7070
default_factory=list,
71-
description="List of DCGM endpoint URLs that were tested for reachability",
71+
description="List of DCGM endpoint URLs in the configured scope for display",
7272
)
7373
endpoints_reachable: list[str] = Field(
7474
default_factory=list,

aiperf/common/models/export_models.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ class JsonMetricResult(AIPerfBaseModel):
4141
class TelemetrySummary(AIPerfBaseModel):
4242
"""Summary information for telemetry collection."""
4343

44-
endpoints_tested: list[str]
44+
endpoints_configured: list[str]
4545
endpoints_successful: list[str]
4646
start_time: datetime
4747
end_time: datetime

aiperf/common/models/telemetry_models.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -335,9 +335,9 @@ class TelemetryResults(AIPerfBaseModel):
335335
description="Start time of telemetry collection in nanoseconds"
336336
)
337337
end_ns: int = Field(description="End time of telemetry collection in nanoseconds")
338-
endpoints_tested: list[str] = Field(
338+
endpoints_configured: list[str] = Field(
339339
default_factory=list,
340-
description="List of DCGM endpoint URLs that were tested for reachability",
340+
description="List of DCGM endpoint URLs in configured scope for display",
341341
)
342342
endpoints_successful: list[str] = Field(
343343
default_factory=list,

0 commit comments

Comments
 (0)