Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions dashboard/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def main():
st.markdown("---")

results_dir = st.text_input(
"测试结果目录", value="../output", help="包含 JSON/CSV 测试结果的目录"
"测试结果目录", value="./output", help="包含 JSON/CSV 测试结果的目录"
)

if not use_mongodb and results_dir != str(
Expand Down Expand Up @@ -122,20 +122,23 @@ def render_dashboard(run_id_filter: str):
<div style="
margin-top: 0.5em;
margin-bottom: 1.5em;
max-width: 1100px;
font-size: 1.05em;
max-width: 80%;
font-size: 1.3em;
line-height: 1.6;
">
<strong>InfiniMetrics Dashboard</strong> 用于统一展示
<strong>通信(NCCL / 集合通信)</strong>、
<strong>训练(Training / 分布式训练)</strong>、
<strong>推理(直接推理 / 服务性能)</strong>、
<strong>算子(核心算子性能)</strong>、
<strong>硬件(内存带宽 / 缓存性能)</strong>
等 AI 加速卡性能测试结果。
<br/>
测试框架输出 <code>JSON</code>(环境 / 配置 / 标量指标) +
<code>CSV</code>(曲线 / 时序数据),
Dashboard 自动加载并支持多次运行的对比分析与可视化。
Dashboard 自动加载并支持多次运行的
<strong>性能对比</strong>、<strong>趋势分析</strong> 与
<strong>可视化展示</strong>。
</div>
""",
unsafe_allow_html=True,
Expand Down Expand Up @@ -177,6 +180,7 @@ def _parse_time(t):
# ========== Categorize runs ==========
comm_runs = [r for r in runs if r.get("testcase", "").startswith("comm")]
infer_runs = [r for r in runs if r.get("testcase", "").startswith("infer")]
train_runs = [r for r in runs if r.get("testcase", "").startswith("train")]

ops_runs, hw_runs = [], []
for r in runs:
Expand All @@ -188,13 +192,14 @@ def _parse_time(t):
hw_runs.append(r)

# ========== KPI ==========
c1, c2, c3, c4, c5, c6 = st.columns(6)
c1, c2, c3, c4, c5, c6, c7 = st.columns(7)
c1.metric("总测试数", total)
c2.metric("成功率", f"{(success/total*100):.1f}%")
c3.metric("通信测试", len(comm_runs))
c4.metric("推理测试", len(infer_runs))
c5.metric("算子测试", len(ops_runs))
c6.metric("硬件检测", len(hw_runs))
c5.metric("训练测试", len(train_runs))
c6.metric("算子测试", len(ops_runs))
c7.metric("硬件检测", len(hw_runs))

st.caption(f"失败测试数:{fail}")
st.caption(f"当前筛选:加速卡={','.join(selected_accs) or '全部'}")
Expand All @@ -208,8 +213,9 @@ def _latest(lst):
latest_comm = _latest(comm_runs)
latest_infer = _latest(infer_runs)
latest_ops = _latest(ops_runs)
latest_train = _latest(train_runs)

colA, colB, colC = st.columns(3)
colA, colB, colC, colD = st.columns(4)

with colA:
st.markdown("#### 🔗 通信(最新)")
Expand Down Expand Up @@ -238,6 +244,17 @@ def _latest(lst):
st.write(f"- time: {latest_ops.get('time','')}")
st.write(f"- status: {'✅' if latest_ops.get('success') else '❌'}")

with colD:
st.markdown("#### 🏋️ 训练(最新)")
if not latest_train:
st.info("暂无训练结果")
else:
framework = latest_train.get("config", {}).get("framework", "unknown")
model = latest_train.get("config", {}).get("model", "unknown")
st.write(f"- 框架/模型: `{framework}/{model}`")
st.write(f"- time: {latest_train.get('time','')}")
st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}")

st.divider()

# ========== Recent runs table ==========
Expand Down Expand Up @@ -294,13 +311,15 @@ def _latest(lst):
st.markdown("---")
st.markdown("### 🚀 快速导航")

col1, col2, col3 = st.columns(3)
col1, col2, col3, col4 = st.columns(4)
if col1.button("🔗 通信测试分析", use_container_width=True):
st.switch_page("pages/communication.py")
if col2.button("⚡ 算子测试分析", use_container_width=True):
st.switch_page("pages/operator.py")
if col3.button("🤖 推理测试分析", use_container_width=True):
if col3.button("🚀 推理测试分析", use_container_width=True):
st.switch_page("pages/inference.py")
if col4.button("🏋️ 训练测试分析", use_container_width=True):
st.switch_page("pages/training.py")

except Exception as e:
st.error(f"Dashboard 加载失败: {e}")
Expand Down
35 changes: 15 additions & 20 deletions dashboard/pages/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def main():
# Status filter
show_success = st.checkbox("仅显示成功测试", value=True)

# Apply filters
# Apply filter
filtered_runs = [
r
for r in comm_runs
Expand Down Expand Up @@ -123,6 +123,7 @@ def main():
identifier = run_info.get("path") or run_info.get("run_id")
result = st.session_state.data_loader.load_test_result(identifier)
run_info["data"] = result

selected_runs.append(run_info)

# Tabs for different views
Expand Down Expand Up @@ -183,36 +184,30 @@ def main():
st.plotly_chart(fig, use_container_width=True)

if len(selected_runs) == 1:
st.markdown("#### 📌 核心指标(最新)")
st.markdown("#### 关键指标")
run = selected_runs[0]
core = extract_core_metrics(run)

c1, c2, c3 = st.columns(3)

c1.metric(
# First Line: numerical indicators
cols = st.columns(3)
cols[0].metric(
"峰值带宽",
(
f"{core['bandwidth_gbps']:.2f} GB/s"
if core["bandwidth_gbps"]
else "-"
),
f"{core['bandwidth_gbps']:.2f} GB/s"
if core["bandwidth_gbps"]
else "-",
)
c2.metric(
cols[1].metric(
"平均延迟",
f"{core['latency_us']:.2f} μs" if core["latency_us"] else "-",
)
c3.metric(
cols[2].metric(
"测试耗时",
f"{core['duration_ms']:.2f} ms" if core["duration_ms"] else "-",
)
# Gauge charts for key metrics
if len(selected_runs) == 1:
st.markdown("#### 关键指标")
run = selected_runs[0]

col1, col2, col3 = st.columns(3)
cols = st.columns(3)

with col1:
with cols[0]:
# Find max bandwidth
max_bw = 0
for metric in run.get("data", {}).get("metrics", []):
Expand All @@ -233,7 +228,7 @@ def main():
st.plotly_chart(fig, use_container_width=True)
break

with col2:
with cols[1]:
# Find average latency
avg_lat = 0
for metric in run.get("data", {}).get("metrics", []):
Expand All @@ -254,7 +249,7 @@ def main():
st.plotly_chart(fig, use_container_width=True)
break

with col3:
with cols[2]:
# Extract duration
duration = 0
for metric in run.get("data", {}).get("metrics", []):
Expand Down
4 changes: 2 additions & 2 deletions dashboard/pages/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
create_summary_table_infer,
)

init_page("推理测试分析 | InfiniMetrics", "🤖")
init_page("推理测试分析 | InfiniMetrics", "🚀")


def main():
Expand Down Expand Up @@ -180,7 +180,7 @@ def _plot_metric(metric_name_contains: str, container):

_plot_metric("infer.compute_latency", c1)
_plot_metric("infer.ttft", c2)
_plot_metric("infer.direct_throughput", c3)
_plot_metric("infer.direct_throughput_tps", c3)

# ---------- Tables ----------
with tab2:
Expand Down
94 changes: 94 additions & 0 deletions dashboard/pages/training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""Training tests analysis page."""

import streamlit as st

from common import init_page
from components.header import render_header
from utils.training_utils import (
load_training_runs,
filter_runs,
create_run_options,
load_selected_runs,
create_training_summary,
)
from utils.training_plots import (
render_performance_curves,
render_throughput_comparison,
render_data_tables,
render_config_details,
)

init_page("训练测试分析 | InfiniMetrics", "🏋️")


def main():
    """Render the training-tests analysis page.

    Loads training runs via the session's data loader, offers sidebar
    filters (framework / model / device count / success-only) and chart
    options, lets the user pick runs to compare, and renders four tabs:
    performance curves, throughput comparison, raw tables, and config
    details.
    """
    render_header()
    st.markdown("## 🏋️ 训练性能测试分析")

    loader = st.session_state.data_loader
    all_runs = load_training_runs(loader)

    # Nothing to analyze — tell the user where results are expected.
    if not all_runs:
        st.info("未找到训练测试结果\n请将训练测试结果放在 output/train/ 或 output/training/ 目录下")
        return

    # ---- Sidebar: filters and chart options ----
    with st.sidebar:
        st.markdown("### 🔍 筛选条件")

        # Distinct filter values harvested from the loaded runs.
        framework_choices = sorted(
            {run.get("config", {}).get("framework", "unknown") for run in all_runs}
        )
        model_choices = sorted(
            {run.get("config", {}).get("model", "unknown") for run in all_runs}
        )
        device_choices = sorted({run.get("device_used", 1) for run in all_runs})

        picked_frameworks = st.multiselect("框架", framework_choices, default=framework_choices)
        picked_models = st.multiselect("模型", model_choices, default=model_choices)
        picked_devices = st.multiselect("设备数", device_choices, default=device_choices)
        success_only = st.checkbox("仅显示成功测试", value=True)

        st.markdown("---")
        st.markdown("### 📈 图表选项")
        log_scale = st.checkbox("Y轴对数刻度", value=False)
        smooth_window = st.slider("平滑窗口", 1, 50, 5, help="对曲线进行移动平均平滑")

    # ---- Apply the sidebar filters ----
    matching = filter_runs(
        all_runs, picked_frameworks, picked_models, picked_devices, success_only
    )
    st.caption(f"找到 {len(matching)} 个训练测试")

    if not matching:
        st.warning("没有符合条件的测试结果")
        return

    # ---- Run selection (multi-select for side-by-side comparison) ----
    run_options = create_run_options(matching)
    labels = list(run_options.keys())
    # Pre-select up to the first three runs (list slicing clamps at the end).
    chosen = st.multiselect(
        "选择要分析的测试运行(可多选对比)",
        labels,
        default=labels[:3],
    )

    if not chosen:
        return

    # Resolve the chosen labels back to full run records.
    loaded_runs = load_selected_runs(loader, matching, run_options, chosen)

    # ---- Analysis tabs ----
    curves_tab, throughput_tab, tables_tab, config_tab = st.tabs(
        ["📈 性能曲线", "📊 吞吐量对比", "📋 数据表格", "🔍 详细配置"]
    )

    with curves_tab:
        render_performance_curves(loaded_runs, smooth_window, log_scale)
    with throughput_tab:
        render_throughput_comparison(loaded_runs)
    with tables_tab:
        render_data_tables(loaded_runs)
    with config_tab:
        render_config_details(loaded_runs, create_training_summary)


# Entry point when this page module is executed directly.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion dashboard/utils/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class InfiniMetricsDataLoader:

def __init__(
self,
results_dir: str = "../output",
results_dir: str = "./output",
use_mongodb: bool = False,
mongo_config=None,
fallback_to_files: bool = True,
Expand Down
2 changes: 1 addition & 1 deletion dashboard/utils/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def source_type(self) -> str:
class FileDataSource(DataSource):
"""File-based data source (reads from JSON/CSV files)."""

def __init__(self, results_dir: str = "../output"):
def __init__(self, results_dir: str = "./output"):
self.results_dir = Path(results_dir)

@property
Expand Down
Loading
Loading