InfiniTensor · Chamberlain0w0 · Mar 17, 2026 · Mar 11, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/dashboard/app.py b/dashboard/app.py
@@ -24,7 +24,7 @@
 )
 
 # Initialize session state
-if "data_loader" not in st.session_state:
+if "data_loader" not in st.session_state:  # guard key must match the attribute assigned below, or the loader is rebuilt on every rerun
     st.session_state.data_loader = InfiniMetricsDataLoader()
 if "selected_accelerators" not in st.session_state:
     st.session_state.selected_accelerators = []
@@ -96,19 +96,22 @@ def render_dashboard(run_id_filter: str):
         <div style="
             margin-top: 0.5em;
             margin-bottom: 1.5em;
-            max-width: 1100px;
-            font-size: 1.05em;
+            max-width: 80%;
+            font-size: 1.3em;
             line-height: 1.6;
         ">
             <strong>InfiniMetrics Dashboard</strong> 用于统一展示
             <strong>通信（NCCL / 集合通信）</strong>、
-            <strong>推理（Direct / Service）</strong>、
+            <strong>训练（Training / 分布式训练）</strong>、
+            <strong>推理（Direct / Service 推理）</strong>、
             <strong>算子（核心算子性能）</strong>
             等 AI 加速卡性能测试结果。
             <br/>
             测试框架输出 <code>JSON</code>（环境 / 配置 / 标量指标） +
             <code>CSV</code>（曲线 / 时序数据），
-            Dashboard 自动加载并支持多次运行的对比分析与可视化。
+            Dashboard 自动加载并支持多次运行的
+            <strong>性能对比</strong>、<strong>趋势分析</strong> 与
+            <strong>可视化展示</strong>。
         </div>
         """,
         unsafe_allow_html=True,
@@ -150,6 +153,7 @@ def _parse_time(t):
         # ========== Categorize runs ==========
         comm_runs = [r for r in runs if r.get("testcase", "").startswith("comm")]
         infer_runs = [r for r in runs if r.get("testcase", "").startswith("infer")]
+        train_runs = [r for r in runs if r.get("testcase", "").startswith("train")]
 
         ops_runs, hw_runs = [], []
         for r in runs:
@@ -161,13 +165,14 @@ def _parse_time(t):
                 hw_runs.append(r)
 
         # ========== KPI ==========
-        c1, c2, c3, c4, c5, c6 = st.columns(6)
+        c1, c2, c3, c4, c5, c6, c7 = st.columns(7)
         c1.metric("总测试数", total)
         c2.metric("成功率", f"{(success/total*100):.1f}%")
         c3.metric("通信测试", len(comm_runs))
         c4.metric("推理测试", len(infer_runs))
-        c5.metric("算子测试", len(ops_runs))
-        c6.metric("硬件检测", len(hw_runs))
+        c5.metric("训练测试", len(train_runs))
+        c6.metric("算子测试", len(ops_runs))
+        c7.metric("硬件检测", len(hw_runs))
 
         st.caption(f"失败测试数：{fail}")
         st.caption(f"当前筛选：加速卡={','.join(selected_accs) or '全部'}")
@@ -181,8 +186,9 @@ def _latest(lst):
         latest_comm = _latest(comm_runs)
         latest_infer = _latest(infer_runs)
         latest_ops = _latest(ops_runs)
+        latest_train = _latest(train_runs)
 
-        colA, colB, colC = st.columns(3)
+        colA, colB, colC, colD = st.columns(4)
 
         with colA:
             st.markdown("#### 🔗 通信（最新）")
@@ -211,6 +217,17 @@ def _latest(lst):
                 st.write(f"- time: {latest_ops.get('time','')}")
                 st.write(f"- status: {'✅' if latest_ops.get('success') else '❌'}")
 
+        with colD:
+            st.markdown("#### 🏋️ 训练（最新）")
+            if not latest_train:
+                st.info("暂无训练结果")
+            else:
+                framework = latest_train.get("config", {}).get("framework", "unknown")
+                model = latest_train.get("config", {}).get("model", "unknown")
+                st.write(f"- 框架/模型: `{framework}/{model}`")
+                st.write(f"- time: {latest_train.get('time','')}")
+                st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}")
+
         st.divider()
 
         # ========== Recent runs table ==========
@@ -267,13 +284,15 @@ def _latest(lst):
         st.markdown("---")
         st.markdown("### 🚀 快速导航")
 
-        col1, col2, col3 = st.columns(3)
+        col1, col2, col3, col4 = st.columns(4)
         if col1.button("🔗 通信测试分析", use_container_width=True):
             st.switch_page("pages/communication.py")
         if col2.button("⚡ 算子测试分析", use_container_width=True):
             st.switch_page("pages/operator.py")
-        if col3.button("🤖 推理测试分析", use_container_width=True):
+        if col3.button("🚀 推理测试分析", use_container_width=True):
             st.switch_page("pages/inference.py")
+        if col4.button("🏋️ 训练测试分析", use_container_width=True):
+            st.switch_page("pages/training.py")
 
     except Exception as e:
         st.error(f"Dashboard 加载失败: {e}")

diff --git a/dashboard/pages/communication.py b/dashboard/pages/communication.py
@@ -17,7 +17,7 @@
     create_summary_table_infer,
 )
 
-init_page("推理测试分析 | InfiniMetrics", "🔗")
+init_page("通信测试分析 | InfiniMetrics", "🔗")
 
 
 def main():
@@ -58,7 +58,7 @@ def main():
             # Status filter
             show_success = st.checkbox("仅显示成功测试", value=True)
 
-            # Apply filters
+            # Apply filter
             filtered_runs = [
                 r
                 for r in comm_runs
@@ -119,6 +119,7 @@ def main():
             run_info = filtered_runs[idx]
             result = st.session_state.data_loader.load_test_result(run_info["path"])
             run_info["data"] = result
+
             selected_runs.append(run_info)
 
         # Tabs for different views
@@ -181,26 +182,53 @@ def main():
             if len(selected_runs) == 1:
                 st.markdown("#### 📌 核心指标（最新）")
                 run = selected_runs[0]
-                core = extract_core_metrics(run)
 
+                max_bw = None
+                avg_lat = None
+                duration = None
+
+                for metric in run.get("data", {}).get("metrics", []):
+                    metric_name = metric.get("name", "")
+
+                    # bandwidth
+                    if (
+                        metric_name == "comm.bandwidth"
+                        and metric.get("data") is not None
+                    ):
+                        df = metric["data"]
+                        if "bandwidth_gbs" in df.columns:
+                            max_bw = df["bandwidth_gbs"].max()
+
+                    # latency
+                    elif (
+                        metric_name == "comm.latency" and metric.get("data") is not None
+                    ):
+                        df = metric["data"]
+                        if "latency_us" in df.columns:
+                            avg_lat = df["latency_us"].mean()
+
+                    # duration
+                    elif metric_name == "comm.duration":
+                        duration = metric.get("value")
                 c1, c2, c3 = st.columns(3)
 
-                c1.metric(
-                    "峰值带宽",
-                    (
-                        f"{core['bandwidth_gbps']:.2f} GB/s"
-                        if core["bandwidth_gbps"]
-                        else "-"
-                    ),
-                )
-                c2.metric(
-                    "平均延迟",
-                    f"{core['latency_us']:.2f} μs" if core["latency_us"] else "-",
-                )
-                c3.metric(
-                    "测试耗时",
-                    f"{core['duration_ms']:.2f} ms" if core["duration_ms"] else "-",
-                )
+                with c1:
+                    if max_bw is not None and max_bw > 0:
+                        st.metric("峰值带宽", f"{max_bw:.2f} GB/s")
+                    else:
+                        st.metric("峰值带宽", "-")
+
+                with c2:
+                    if avg_lat is not None and avg_lat > 0:
+                        st.metric("平均延迟", f"{avg_lat:.2f} μs")
+                    else:
+                        st.metric("平均延迟", "-")
+
+                with c3:
+                    if duration is not None and duration > 0:
+                        st.metric("测试耗时", f"{duration:.2f} ms")
+                    else:
+                        st.metric("测试耗时", "-")
             # Gauge charts for key metrics
             if len(selected_runs) == 1:
                 st.markdown("#### 关键指标")
@@ -221,7 +249,7 @@ def main():
                                 max_bw = df["bandwidth_gbs"].max()
                                 fig = create_gauge_chart(
                                     max_bw,
-                                    300,  # Theoretical max for A100 NVLink
+                                    300,
                                     "峰值带宽",
                                     "blue",
                                     "GB/s",
@@ -242,7 +270,7 @@ def main():
                                 avg_lat = df["latency_us"].mean()
                                 fig = create_gauge_chart(
                                     avg_lat,
-                                    1000,  # Reference: 1000 µs
+                                    1000,
                                     "平均延迟",
                                     "red",
                                     "µs",
@@ -261,7 +289,7 @@ def main():
                     if duration > 0:
                         fig = create_gauge_chart(
                             duration,
-                            duration * 2,  # Scale to show progress
+                            duration * 2,
                             "测试耗时",
                             "green",
                             "ms",

diff --git a/dashboard/pages/inference.py b/dashboard/pages/inference.py
@@ -12,7 +12,7 @@
     create_summary_table_infer,
 )
 
-init_page("推理测试分析 | InfiniMetrics", "🤖")
+init_page("推理测试分析 | InfiniMetrics", "🚀")
 
 
 def main():
@@ -178,7 +178,7 @@ def _plot_metric(metric_name_contains: str, container):
 
         _plot_metric("infer.compute_latency", c1)
         _plot_metric("infer.ttft", c2)
-        _plot_metric("infer.direct_throughput", c3)
+        _plot_metric("infer.direct_throughput_tps", c3)
 
     # ---------- Tables ----------
     with tab2: