Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions dashboard/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)

# Initialize session state
if "data_loader" not in st.session_state:
if "ni" not in st.session_state:
st.session_state.data_loader = InfiniMetricsDataLoader()
if "selected_accelerators" not in st.session_state:
st.session_state.selected_accelerators = []
Expand Down Expand Up @@ -96,19 +96,22 @@ def render_dashboard(run_id_filter: str):
<div style="
margin-top: 0.5em;
margin-bottom: 1.5em;
max-width: 1100px;
font-size: 1.05em;
max-width: 80%;
font-size: 1.3em;
line-height: 1.6;
">
<strong>InfiniMetrics Dashboard</strong> 用于统一展示
<strong>通信(NCCL / 集合通信)</strong>、
<strong>推理(Direct / Service)</strong>、
<strong>训练(Training / 分布式训练)</strong>、
<strong>推理(Direct / Service 推理)</strong>、
<strong>算子(核心算子性能)</strong>
等 AI 加速卡性能测试结果。
<br/>
测试框架输出 <code>JSON</code>(环境 / 配置 / 标量指标) +
<code>CSV</code>(曲线 / 时序数据),
Dashboard 自动加载并支持多次运行的对比分析与可视化。
Dashboard 自动加载并支持多次运行的
<strong>性能对比</strong>、<strong>趋势分析</strong> 与
<strong>可视化展示</strong>。
</div>
""",
unsafe_allow_html=True,
Expand Down Expand Up @@ -150,6 +153,7 @@ def _parse_time(t):
# ========== Categorize runs ==========
comm_runs = [r for r in runs if r.get("testcase", "").startswith("comm")]
infer_runs = [r for r in runs if r.get("testcase", "").startswith("infer")]
train_runs = [r for r in runs if r.get("testcase", "").startswith("train")]

ops_runs, hw_runs = [], []
for r in runs:
Expand All @@ -161,13 +165,14 @@ def _parse_time(t):
hw_runs.append(r)

# ========== KPI ==========
c1, c2, c3, c4, c5, c6 = st.columns(6)
c1, c2, c3, c4, c5, c6, c7 = st.columns(7)
c1.metric("总测试数", total)
c2.metric("成功率", f"{(success/total*100):.1f}%")
c3.metric("通信测试", len(comm_runs))
c4.metric("推理测试", len(infer_runs))
c5.metric("算子测试", len(ops_runs))
c6.metric("硬件检测", len(hw_runs))
c5.metric("训练测试", len(train_runs))
c6.metric("算子测试", len(ops_runs))
c7.metric("硬件检测", len(hw_runs))

st.caption(f"失败测试数:{fail}")
st.caption(f"当前筛选:加速卡={','.join(selected_accs) or '全部'}")
Expand All @@ -181,8 +186,9 @@ def _latest(lst):
latest_comm = _latest(comm_runs)
latest_infer = _latest(infer_runs)
latest_ops = _latest(ops_runs)
latest_train = _latest(train_runs)

colA, colB, colC = st.columns(3)
colA, colB, colC, colD = st.columns(4)

with colA:
st.markdown("#### 🔗 通信(最新)")
Expand Down Expand Up @@ -211,6 +217,17 @@ def _latest(lst):
st.write(f"- time: {latest_ops.get('time','')}")
st.write(f"- status: {'✅' if latest_ops.get('success') else '❌'}")

with colD:
st.markdown("#### 🏋️ 训练(最新)")
if not latest_train:
st.info("暂无训练结果")
else:
framework = latest_train.get("config", {}).get("framework", "unknown")
model = latest_train.get("config", {}).get("model", "unknown")
st.write(f"- 框架/模型: `{framework}/{model}`")
st.write(f"- time: {latest_train.get('time','')}")
st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}")

st.divider()

# ========== Recent runs table ==========
Expand Down Expand Up @@ -267,13 +284,15 @@ def _latest(lst):
st.markdown("---")
st.markdown("### 🚀 快速导航")

col1, col2, col3 = st.columns(3)
col1, col2, col3, col4 = st.columns(4)
if col1.button("🔗 通信测试分析", use_container_width=True):
st.switch_page("pages/communication.py")
if col2.button("⚡ 算子测试分析", use_container_width=True):
st.switch_page("pages/operator.py")
if col3.button("🤖 推理测试分析", use_container_width=True):
if col3.button("🚀 推理测试分析", use_container_width=True):
st.switch_page("pages/inference.py")
if col4.button("🏋️ 训练测试分析", use_container_width=True):
st.switch_page("pages/training.py")

except Exception as e:
st.error(f"Dashboard 加载失败: {e}")
Expand Down
72 changes: 50 additions & 22 deletions dashboard/pages/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
create_summary_table_infer,
)

init_page("推理测试分析 | InfiniMetrics", "🔗")
init_page("通信测试分析 | InfiniMetrics", "🔗")


def main():
Expand Down Expand Up @@ -58,7 +58,7 @@ def main():
# Status filter
show_success = st.checkbox("仅显示成功测试", value=True)

# Apply filters
# Apply filter
filtered_runs = [
r
for r in comm_runs
Expand Down Expand Up @@ -119,6 +119,7 @@ def main():
run_info = filtered_runs[idx]
result = st.session_state.data_loader.load_test_result(run_info["path"])
run_info["data"] = result

selected_runs.append(run_info)

# Tabs for different views
Expand Down Expand Up @@ -181,26 +182,53 @@ def main():
if len(selected_runs) == 1:
st.markdown("#### 📌 核心指标(最新)")
run = selected_runs[0]
core = extract_core_metrics(run)

max_bw = None
avg_lat = None
duration = None

for metric in run.get("data", {}).get("metrics", []):
metric_name = metric.get("name", "")

# bandwidth
if (
metric_name == "comm.bandwidth"
and metric.get("data") is not None
):
df = metric["data"]
if "bandwidth_gbs" in df.columns:
max_bw = df["bandwidth_gbs"].max()

# latency
elif (
metric_name == "comm.latency" and metric.get("data") is not None
):
df = metric["data"]
if "latency_us" in df.columns:
avg_lat = df["latency_us"].mean()

# duration
elif metric_name == "comm.duration":
duration = metric.get("value")
c1, c2, c3 = st.columns(3)

c1.metric(
"峰值带宽",
(
f"{core['bandwidth_gbps']:.2f} GB/s"
if core["bandwidth_gbps"]
else "-"
),
)
c2.metric(
"平均延迟",
f"{core['latency_us']:.2f} μs" if core["latency_us"] else "-",
)
c3.metric(
"测试耗时",
f"{core['duration_ms']:.2f} ms" if core["duration_ms"] else "-",
)
with c1:
if max_bw is not None and max_bw > 0:
st.metric("峰值带宽", f"{max_bw:.2f} GB/s")
else:
st.metric("峰值带宽", "-")

with c2:
if avg_lat is not None and avg_lat > 0:
st.metric("平均延迟", f"{avg_lat:.2f} μs")
else:
st.metric("平均延迟", "-")

with c3:
if duration is not None and duration > 0:
st.metric("测试耗时", f"{duration:.2f} ms")
else:
st.metric("测试耗时", "-")
# Gauge charts for key metrics
if len(selected_runs) == 1:
st.markdown("#### 关键指标")
Expand All @@ -221,7 +249,7 @@ def main():
max_bw = df["bandwidth_gbs"].max()
fig = create_gauge_chart(
max_bw,
300, # Theoretical max for A100 NVLink
300,
"峰值带宽",
"blue",
"GB/s",
Expand All @@ -242,7 +270,7 @@ def main():
avg_lat = df["latency_us"].mean()
fig = create_gauge_chart(
avg_lat,
1000, # Reference: 1000 µs
1000,
"平均延迟",
"red",
"µs",
Expand All @@ -261,7 +289,7 @@ def main():
if duration > 0:
fig = create_gauge_chart(
duration,
duration * 2, # Scale to show progress
duration * 2,
"测试耗时",
"green",
"ms",
Expand Down
4 changes: 2 additions & 2 deletions dashboard/pages/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
create_summary_table_infer,
)

init_page("推理测试分析 | InfiniMetrics", "🤖")
init_page("推理测试分析 | InfiniMetrics", "🚀")


def main():
Expand Down Expand Up @@ -178,7 +178,7 @@ def _plot_metric(metric_name_contains: str, container):

_plot_metric("infer.compute_latency", c1)
_plot_metric("infer.ttft", c2)
_plot_metric("infer.direct_throughput", c3)
_plot_metric("infer.direct_throughput_tps", c3)

# ---------- Tables ----------
with tab2:
Expand Down
Loading