From 9ccd4f5031d8079692d8a32a5ebbb879ca156c66 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Mon, 27 Apr 2026 21:45:16 +0800
Subject: [PATCH 01/20] add explorer perf suite

---
 perf/scripts/explorer/README.md    | 374 +++++++++++++++++++++++++++++
 perf/scripts/explorer/example.yaml |  69 ++++++
 perf/scripts/explorer/main.py      | 218 +++++++++++++++++
 pyproject.toml                     |   2 +
 trinity/perf/__init__.py           |  17 ++
 trinity/perf/explorer_metrics.py   | 114 +++++++++
 trinity/perf/report_utils.py       |  59 +++++
 trinity/perf/resource_backends.py  | 119 +++++++++
 trinity/perf/resource_sampler.py   |  69 ++++++
 9 files changed, 1041 insertions(+)
 create mode 100644 perf/scripts/explorer/README.md
 create mode 100644 perf/scripts/explorer/example.yaml
 create mode 100644 perf/scripts/explorer/main.py
 create mode 100644 trinity/perf/__init__.py
 create mode 100644 trinity/perf/explorer_metrics.py
 create mode 100644 trinity/perf/report_utils.py
 create mode 100644 trinity/perf/resource_backends.py
 create mode 100644 trinity/perf/resource_sampler.py

diff --git a/perf/scripts/explorer/README.md b/perf/scripts/explorer/README.md
new file mode 100644
index 00000000000..88681d2f8b9
--- /dev/null
+++ b/perf/scripts/explorer/README.md
@@ -0,0 +1,374 @@
+# Explorer 性能评测工具
+
+## 目标
+
+该工具用于评测 Trinity-RFT 中 Explorer 模块的运行性能，不关注模型训练效果，也不复用 Trinity 主流程中的 benchmark 或 eval 语义。
+
+第一版的设计目标如下：
+
+1. 不修改 Trinity 主流程代码，只在 perf 目录下独立实现。
+2. 资源采集能力可独立复用，后续可直接供 trainer perf 使用。
+3. 吞吐量和任务平均完成时间优先复用 Explorer 已有 step metrics。
+4. 同时提供全局汇总指标和 step 级指标。
+5. 单次运行完成后直接输出汇总结果，不考虑 warmup 和多轮重复实验。
+
+## 统计范围
+
+该工具计划统计以下性能指标：
+
+1. 初始化时间：从开始启动到初始化完成的时间。
+2. 吞吐量：单位时间内完成的任务数量，单位为 task / min。
+3. 每个任务的平均完成时间：单位为 sec / task。
+4. 资源使用情况：CPU、GPU 利用率、内存以及 GPU 显存使用情况，按固定间隔采样，并保留时序序列。
+5. step 级性能：每个 step 的任务完成数、吞吐量、平均任务耗时以及对应原始 rollout 指标。
+
+## 边界约束
+
+该工具的边界约束如下：
+
+1. 不修改 [trinity/explorer/explorer.py](/root/Trinity-RFT/trinity/explorer/explorer.py) 或其他主流程模块。
+2. 不把资源采集逻辑塞进现有 monitor 框架。
+3. 不将训练效果评估类的 benchmark 或 eval 指标混入性能结果。
+4. 不做 warmup、对照实验或多次重复取均值。
+5. 第一版默认运行环境具备可用 GPU，不额外兼容无 GPU 场景。
+6. 第一版优先支持单次本地运行和结果落盘。
+
+## 设计概览
+
+整体思路是把 Explorer perf 拆成两条完全独立的链路：
+
+1. 运行链路：启动 Explorer，统计启动耗时和总运行时间。
+2. 观测链路：在外部独立采集系统资源，并从 TensorBoard 文件读取 step metrics。
+
+其中：
+
+1. 资源数据来自 perf 下的独立采样模块。
+2. 吞吐量和任务平均完成时间来自 Explorer monitor 产出的 step metrics。
+3. 汇总逻辑在 perf 脚本内完成，不侵入 Trinity 现有实现。
+
+## 推荐目录结构
+
+建议按如下方式组织代码：
+
+```text
+perf/
+  scripts/
+    explorer/
+      README.md
+      example.yaml
+      main.py
+trinity/
+  perf/
+    __init__.py
+    explorer_metrics.py
+    report_utils.py
+    resource_backends.py
+    resource_sampler.py
+```
+
+各文件职责建议如下：
+
+1. `trinity/perf/resource_backends.py`
+   封装资源采集后端，例如 `psutil` 和 `pynvml`。
+2. `trinity/perf/resource_sampler.py`
+   提供独立资源采样器，支持启动、停止、导出原始样本和聚合统计。
+3. `trinity/perf/report_utils.py`
+   提供时间序列聚合、百分位数计算和统一 JSON 序列化能力。
+4. `perf/scripts/explorer/main.py`
+   负责 Explorer perf 的命令行入口、单次运行编排和结果落盘。
+5. `perf/scripts/explorer/example.yaml`
+   提供最小可运行的 Trinity Explorer 配置样例。
+
+这种拆分方式的核心目的是让资源采集模块天然进入 `trinity` 命名空间，后续 trainer perf 可以直接复用 `trinity.perf.*`。
+
+## 运行流程草案
+
+Explorer perf 脚本建议按以下阶段执行：
+
+1. 读取 Trinity 配置文件并校验 `mode: explore`。
+2. 校验 `monitor.monitor_type == tensorboard`。
+3. 初始化资源采样器并启动后台采样。
+4. 创建 Explorer actor。
+5. 单独计时 `prepare.remote()`，得到启动耗时。
+6. 执行 `sync_weight.remote()`。
+7. 执行 `explore.remote()` 直到运行结束。
+8. 停止资源采样。
+9. 解析 TensorBoard 本地文件，提取 step 级 metrics。
+10. 聚合资源指标和 Explorer step 指标。
+11. 输出 JSON 结果到指定路径。
+
+这里的“启动耗时”定义为：
+
+1. 从 perf 脚本开始创建 Explorer actor。
+2. 到 `prepare.remote()` 成功返回为止。
+
+这样可以覆盖模型准备、rollout coordinator 准备等初始化成本，同时保持对主流程零侵入。
+
+## 指标来源草案
+
+### 资源指标
+
+资源指标由 perf 目录下的独立采样模块提供，建议采样字段如下：
+
+1. `timestamp`
+2. `cpu_percent`
+3. `memory_rss_mb`
+4. `memory_percent`
+5. `gpu_metrics`
+
+其中 `gpu_metrics` 建议按卡记录，例如：
+
+1. `gpu_id`
+2. `gpu_util_percent`
+3. `gpu_memory_used_mb`
+4. `gpu_memory_total_mb`
+
+第一版建议优先支持整机级采样，不强制做按 Ray actor 或 PID 树聚合。
+
+资源序列的展示目标如下：
+
+1. CPU 只保留一条时间线。
+2. GPU 为每张卡分别保留一条时间线。
+3. 结果格式优先为后续折线图绘制服务，而不是做离线聚合统计。
+
+### Explorer 运行指标
+
+Explorer 运行指标优先从 TensorBoard 事件文件提取，原因如下：
+
+1. 不需要修改 Explorer 主流程。
+2. Explorer 已有 monitor 写本地标量文件。
+3. step 级指标能够自然复用，不需要重新推导内部状态。
+
+因此，第一版建议明确要求 monitor 使用 `tensorboard`。
+
+## 吞吐量与平均任务耗时口径
+
+建议统一采用以下统计口径：
+
+1. step 吞吐量：`finished_task_count / step_time_sec * 60`
+2. step 平均任务耗时：`step_time_sec / finished_task_count`
+3. 全局吞吐量：`sum(finished_task_count) / sum(step_time_sec) * 60`
+4. 全局平均任务耗时：`sum(step_time_sec) / sum(finished_task_count)`
+
+实现时优先直接读取 TensorBoard 中已有的 step 级时间类指标。如果不同配置场景下字段名存在差异，建议在 perf 代码中维护字段映射表，而不要把具体字段名散落在业务逻辑中。
+
+## 输出结果结构草案
+
+结果文件建议输出为一个 JSON 文档，结构如下：
+
+```json
+{
+  "run_meta": {},
+  "timing": {},
+  "resource_timeline": [],
+  "step_metrics": [],
+  "global_metrics": {},
+  "artifacts": {},
+  "status": {}
+}
+```
+
+各字段建议含义如下：
+
+### `run_meta`
+
+记录一次 perf 运行的基础信息：
+
+1. config 路径
+2. explorer 名称
+3. 采样间隔
+4. 启动时间
+5. hostname
+6. pid
+
+### `timing`
+
+记录关键耗时：
+
+1. `startup_time_sec`
+2. `run_time_sec`
+3. `total_time_sec`
+
+### `resource_timeline`
+
+记录原始采样序列，用于后续可视化或 trainer perf 复用。
+
+建议至少包含以下结构：
+
+1. `timestamp`
+2. `cpu_percent`
+3. `memory_rss_mb`
+4. `gpu_metrics`
+
+其中 `gpu_metrics` 为数组，每个元素对应一张卡，例如：
+
+1. `gpu_id`
+2. `gpu_util_percent`
+3. `gpu_memory_used_mb`
+
+结果组织应优先满足以下可视化需求：
+
+1. CPU 一条折线。
+2. GPU utilization 按卡多条折线。
+3. GPU memory used 按卡多条折线。
+
+### `step_metrics`
+
+每个 step 一条记录，建议包含：
+
+1. `step`
+2. `finished_task_count`
+3. `throughput_task_per_min`
+4. `avg_task_time_sec`
+5. `raw_metrics`
+
+### `global_metrics`
+
+记录全局性能指标，建议至少包含：
+
+1. `total_finished_task_count`
+2. `overall_throughput_task_per_min`
+3. `overall_avg_task_time_sec`
+
+### `artifacts`
+
+记录排障和追踪所需路径：
+
+1. `checkpoint_job_dir`
+2. `tensorboard_dir`
+3. `log_dir`
+4. `output_json`
+
+### `status`
+
+记录运行状态：
+
+1. 是否成功完成。
+2. 异常信息。
+3. 是否拿到 GPU 指标。
+
+## 命令行接口草案
+
+当前建议的命令行形式如下：
+
+## 使用方法
+
+```
+python main.py --config <path_to_config> --output-path <path_to_output> [--monitor-interval <interval_in_seconds>]
+```
+
+建议支持以下参数：
+
+1. `--config`
+   Trinity 配置文件路径，要求符合 Trinity 配置规范，且模式为 `explore`。
+2. `--output-path`
+   结果 JSON 输出路径。
+3. `--monitor-interval`
+   资源采样间隔，默认 5 秒。
+4. `--timeout`
+   整次 perf 运行的超时时间，可选。
+5. `--resource-backend`
+   资源采样后端，例如 `auto`、`psutil`、`pynvml`。
+
+## 配置要求
+
+该工具依赖以下配置约束：
+
+1. `mode` 必须为 `explore`。
+2. `monitor.monitor_type` 必须为 `tensorboard`。
+3. Explorer 本身应能在当前环境下正常启动和运行。
+
+如果 monitor 不是 `tensorboard`，建议 perf 工具直接报错退出，而不是在运行时偷偷覆盖用户配置。
+
+## 示例结果草案
+
+下面给出一个结果结构示意：
+
+```json
+{
+  "run_meta": {
+    "config_path": "perf/scripts/explorer/example.yaml",
+    "monitor_interval_sec": 5
+  },
+  "timing": {
+    "startup_time_sec": 32.5,
+    "run_time_sec": 640.2,
+    "total_time_sec": 672.7
+  },
+  "resource_timeline": [
+    {
+      "timestamp": 1710000000.0,
+      "cpu_percent": 71.2,
+      "memory_rss_mb": 18342.0,
+      "gpu_metrics": [
+        {
+          "gpu_id": 0,
+          "gpu_util_percent": 84.0,
+          "gpu_memory_used_mb": 22134.0
+        },
+        {
+          "gpu_id": 1,
+          "gpu_util_percent": 79.0,
+          "gpu_memory_used_mb": 21980.0
+        }
+      ]
+    }
+  ],
+  "step_metrics": [
+    {
+      "step": 1,
+      "finished_task_count": 64,
+      "throughput_task_per_min": 384.0,
+      "avg_task_time_sec": 0.156,
+      "raw_metrics": {}
+    }
+  ],
+  "global_metrics": {
+    "total_finished_task_count": 1024,
+    "overall_throughput_task_per_min": 401.7,
+    "overall_avg_task_time_sec": 0.149
+  }
+}
+```
+
+## 实现清单
+
+下面是建议的实现顺序：
+
+1. 定义结果 JSON schema 和字段口径。
+2. 实现 `trinity/perf/resource_backends.py`。
+3. 实现 `trinity/perf/resource_sampler.py`。
+4. 实现 `trinity/perf/report_utils.py`。
+5. 在 `perf/scripts/explorer/main.py` 中完成单次运行编排。
+6. 在 `perf/scripts/explorer/main.py` 中实现 TensorBoard 指标解析。
+7. 补充 `perf/scripts/explorer/example.yaml`。
+8. 补充测试和文档示例。
+
+## 测试建议
+
+第一版建议优先补以下测试：
+
+1. 资源采样器可以稳定输出 CPU 单线时序和按卡 GPU 时序。
+3. TensorBoard 解析逻辑可以正确提取 step 级 metrics。
+4. 全局吞吐量和平均任务耗时计算口径正确。
+5. Explorer perf 运行失败时仍能输出可诊断的状态字段。
+
+## 已知取舍
+
+第一版的取舍如下：
+
+1. 优先做整机级资源观测，不强制追踪 Ray actor 子进程。
+2. 强依赖 `tensorboard` monitor，不额外兼容 wandb 或 mlflow。
+3. 默认要求运行环境具备 GPU，不额外兼容无 GPU 场景。
+4. 资源结果优先保留时序序列，不在第一版中输出 `mean/max/min/p50/p95` 聚合统计。
+5. 只做单次运行和汇总，不做 warmup 和多次重复实验。
+6. 资源采样和 TensorBoard 指标解析均在 perf 层完成，不入侵主流程。
+
+## 后续扩展方向
+
+该设计后续可以自然扩展到：
+
+1. trainer perf 复用资源采样模块。
+2. 自动生成 Markdown 报告。
+3. 增加 PID 级或进程树级资源观测。
+4. 支持更多 monitor 后端的指标解析。
diff --git a/perf/scripts/explorer/example.yaml b/perf/scripts/explorer/example.yaml
new file mode 100644
index 00000000000..e95697f3978
--- /dev/null
+++ b/perf/scripts/explorer/example.yaml
@@ -0,0 +1,69 @@
+mode: explore
+project: Trinity-RFT
+group: explorer-perf
+name: example
+checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,/tmp/trinity-checkpoints}
+
+algorithm:
+  algorithm_type: ppo
+  repeat_times: 4
+  optimizer:
+    lr: 1e-06
+
+data_processor: {}
+
+model:
+  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-0.6B}
+  max_prompt_tokens: 256
+  max_response_tokens: 1024
+  enable_thinking: false
+
+cluster:
+  ray_address: ${oc.env:RAY_ADDRESS,auto}
+  node_num: 1
+  gpu_per_node: 1
+
+buffer:
+  batch_size: 8
+  total_epochs: 1
+  explorer_input:
+    taskset:
+      name: taskset
+      storage_type: file
+      path: ${oc.env:TRINITY_TASKSET_PATH,null}
+      split: train
+      format:
+        prompt_key: question
+        response_key: answer
+      rollout_args:
+        temperature: 1.0
+        logprobs: 0
+      workflow_type: math_workflow
+      reward_fn_type: countdown_reward
+    eval_tasksets: []
+
+explorer:
+  runner_per_model: 4
+  max_timeout: 900
+  max_retry_times: 2
+  eval_interval: 1000
+  eval_on_startup: false
+  rollout_model:
+    engine_num: 1
+    tensor_parallel_size: 1
+    enforce_eager: false
+    enable_prefix_caching: false
+    enable_chunked_prefill: false
+    gpu_memory_utilization: 0.85
+    dtype: bfloat16
+    seed: 42
+    enable_openai_api: false
+
+monitor:
+  monitor_type: tensorboard
+
+synchronizer:
+  sync_method: nccl
+  sync_style: fixed
+  sync_interval: 1
+  sync_timeout: 1200
diff --git a/perf/scripts/explorer/main.py b/perf/scripts/explorer/main.py
new file mode 100644
index 00000000000..898b8fc2609
--- /dev/null
+++ b/perf/scripts/explorer/main.py
@@ -0,0 +1,218 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import socket
+import time
+import traceback
+from pathlib import Path
+from typing import Any, Optional
+
+import ray
+from ray.exceptions import GetTimeoutError, RayTaskError
+
+from trinity.buffer.pipelines.task_pipeline import check_and_run_task_pipeline
+from trinity.common.config import Config, load_config
+from trinity.explorer.explorer import Explorer
+from trinity.perf import (
+    ResourceSampler,
+    TensorBoardScalarReader,
+    build_global_metrics,
+    build_resource_timeline_payload,
+    collect_step_metrics,
+)
+from trinity.utils.plugin_loader import load_plugins
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse CLI arguments."""
+    parser = argparse.ArgumentParser(description="Run Explorer performance collection.")
+    parser.add_argument("--config", required=True, help="Path to the Trinity explorer config.")
+    parser.add_argument("--output-path", required=True, help="Path to the output JSON file.")
+    parser.add_argument(
+        "--monitor-interval",
+        type=float,
+        default=5.0,
+        help="Resource sampling interval in seconds.",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=None,
+        help="Optional timeout in seconds for prepare, sync and explore calls.",
+    )
+    return parser.parse_args()
+
+
+def validate_config(config: Config) -> None:
+    """Validate perf-specific config constraints."""
+    if config.mode != "explore":
+        raise ValueError(f"Explorer perf requires mode 'explore', got '{config.mode}'.")
+    if config.monitor.monitor_type != "tensorboard":
+        raise ValueError(
+            "Explorer perf requires monitor.monitor_type='tensorboard' so step metrics can "
+            "be read from local event files."
+        )
+
+
+def build_output_payload(
+    *,
+    config: Optional[Config],
+    config_path: str,
+    output_path: str,
+    monitor_interval: float,
+    startup_time_sec: Optional[float],
+    run_time_sec: Optional[float],
+    total_time_sec: Optional[float],
+    resource_payload: dict[str, Any],
+    step_metrics: list[dict[str, Any]],
+    success: bool,
+    error: Optional[str],
+) -> dict[str, Any]:
+    """Assemble the final JSON payload."""
+    artifacts = {}
+    explorer_name = None
+    if config is not None:
+        explorer_name = config.explorer.name
+        artifacts = {
+            "checkpoint_job_dir": config.checkpoint_job_dir,
+            "tensorboard_dir": os.path.join(
+                config.monitor.cache_dir, "tensorboard", config.explorer.name
+            ),
+            "log_dir": config.log.save_dir,
+            "output_json": output_path,
+        }
+
+    return {
+        "run_meta": {
+            "config_path": config_path,
+            "explorer_name": explorer_name,
+            "monitor_interval_sec": monitor_interval,
+            "hostname": socket.gethostname(),
+            "pid": os.getpid(),
+            "generated_at": time.time(),
+        },
+        "timing": {
+            "startup_time_sec": startup_time_sec,
+            "run_time_sec": run_time_sec,
+            "total_time_sec": total_time_sec,
+        },
+        **resource_payload,
+        "step_metrics": step_metrics,
+        "global_metrics": build_global_metrics(step_metrics),
+        "artifacts": artifacts,
+        "status": {
+            "success": success,
+            "error": error,
+            "gpu_metrics_available": bool(resource_payload.get("resource_timeline")),
+            "tensorboard_metrics_available": bool(step_metrics),
+        },
+    }
+
+
+def write_output(output_path: str, payload: dict[str, Any]) -> None:
+    """Write the final payload to disk."""
+    output_file = Path(output_path)
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    output_file.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
+    """Run Explorer perf collection and return the result payload."""
+    load_plugins()
+    config: Optional[Config] = None
+    explorer_actor = None
+    sampler: Optional[ResourceSampler] = None
+    success = False
+    error: Optional[str] = None
+    startup_time_sec: Optional[float] = None
+    run_time_sec: Optional[float] = None
+    total_time_sec: Optional[float] = None
+    resource_payload: dict[str, Any] = {"resource_timeline": [], "chart_series": {}}
+    step_metrics: list[dict[str, Any]] = []
+    startup_started_at: Optional[float] = None
+    run_started_at: Optional[float] = None
+
+    try:
+        config = load_config(args.config)
+        validate_config(config)
+        config.check_and_update()
+
+        ray.init(
+            address=config.cluster.ray_address,
+            ignore_reinit_error=True,
+            namespace=config.ray_namespace,
+            runtime_env={"env_vars": config.get_envs()},
+        )
+        check_and_run_task_pipeline(config)
+
+        sampler = ResourceSampler(interval_seconds=args.monitor_interval)
+        sampler.start()
+
+        startup_started_at = time.perf_counter()
+        explorer_actor = Explorer.get_actor(config)
+        ray.get(explorer_actor.prepare.remote(), timeout=args.timeout)
+        startup_time_sec = time.perf_counter() - startup_started_at
+
+        run_started_at = time.perf_counter()
+        ray.get(explorer_actor.sync_weight.remote(), timeout=args.timeout)
+        ray.get(explorer_actor.explore.remote(), timeout=args.timeout)
+        run_time_sec = time.perf_counter() - run_started_at
+        total_time_sec = time.perf_counter() - startup_started_at
+        success = True
+    except (RuntimeError, ValueError, TimeoutError, GetTimeoutError, RayTaskError):
+        error = traceback.format_exc()
+        if startup_started_at is not None and startup_time_sec is None and run_started_at is None:
+            startup_time_sec = time.perf_counter() - startup_started_at
+        if run_started_at is not None and run_time_sec is None:
+            run_time_sec = time.perf_counter() - run_started_at
+        if startup_started_at is not None and total_time_sec is None:
+            total_time_sec = time.perf_counter() - startup_started_at
+    finally:
+        collected_samples = sampler.stop() if sampler is not None else []
+        resource_payload = build_resource_timeline_payload(collected_samples)
+
+        if explorer_actor is not None and ray.is_initialized():
+            try:
+                ray.get(explorer_actor.shutdown.remote(), timeout=args.timeout)
+            except (RuntimeError, TimeoutError, GetTimeoutError, RayTaskError):
+                if error is None:
+                    error = traceback.format_exc()
+
+        if config is not None:
+            tensorboard_dir = os.path.join(
+                config.monitor.cache_dir, "tensorboard", config.explorer.name
+            )
+            if os.path.isdir(tensorboard_dir):
+                scalar_reader = TensorBoardScalarReader(tensorboard_dir)
+                step_metrics = collect_step_metrics(scalar_reader.metrics)
+
+        if ray.is_initialized():
+            ray.shutdown()
+
+    return build_output_payload(
+        config=config,
+        config_path=args.config,
+        output_path=args.output_path,
+        monitor_interval=args.monitor_interval,
+        startup_time_sec=startup_time_sec,
+        run_time_sec=run_time_sec,
+        total_time_sec=total_time_sec,
+        resource_payload=resource_payload,
+        step_metrics=step_metrics,
+        success=success,
+        error=error,
+    )
+
+
+def main() -> int:
+    """CLI entrypoint."""
+    args = parse_args()
+    payload = run_explorer_perf(args)
+    write_output(args.output_path, payload)
+    return 0 if payload["status"]["success"] else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/pyproject.toml b/pyproject.toml
index b5463a4104b..3e9b9276aca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,8 @@ dependencies = [
     "sortedcontainers",
     "word2number",
     "matplotlib",
+    "psutil",
+    "nvidia-ml-py",
     "transformers>=5.5.3",
     "datasets>=4.0.0",
     "typer>=0.20.1",
diff --git a/trinity/perf/__init__.py b/trinity/perf/__init__.py
new file mode 100644
index 00000000000..717772f4d31
--- /dev/null
+++ b/trinity/perf/__init__.py
@@ -0,0 +1,17 @@
+"""Performance tooling package for Trinity."""
+
+from trinity.perf.explorer_metrics import (
+    TensorBoardScalarReader,
+    build_global_metrics,
+    collect_step_metrics,
+)
+from trinity.perf.report_utils import build_resource_timeline_payload
+from trinity.perf.resource_sampler import ResourceSampler
+
+__all__ = [
+    "ResourceSampler",
+    "TensorBoardScalarReader",
+    "build_global_metrics",
+    "build_resource_timeline_payload",
+    "collect_step_metrics",
+]
\ No newline at end of file
diff --git a/trinity/perf/explorer_metrics.py b/trinity/perf/explorer_metrics.py
new file mode 100644
index 00000000000..84149fe33fa
--- /dev/null
+++ b/trinity/perf/explorer_metrics.py
@@ -0,0 +1,114 @@
+"""Helpers for Explorer performance metric parsing and aggregation."""
+
+from __future__ import annotations
+
+import os
+from collections import defaultdict
+from typing import Any, Optional
+
+from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
+
+STEP_TIME_METRIC_CANDIDATES = (
+    "time/wait_explore_step",
+    "time/explorer_sync_interval",
+)
+FINISHED_TASK_METRIC_NAME = "rollout/finished_task_count"
+
+
+class TensorBoardScalarReader:
+    """Read scalar metrics from TensorBoard event files."""
+
+    def __init__(self, log_dir: str):
+        self.log_dir = log_dir
+        self.metrics = self._load_metrics(log_dir)
+
+    def _load_metrics(self, log_dir: str) -> dict[str, dict[int, float]]:
+        metric_map: dict[str, dict[int, float]] = defaultdict(dict)
+        for event_file in self._find_event_files(log_dir):
+            accumulator = EventAccumulator(event_file)
+            accumulator.Reload()
+            for tag in accumulator.Tags().get("scalars", []):
+                for scalar in accumulator.Scalars(tag):
+                    prior_value = metric_map[tag].get(scalar.step)
+                    if prior_value is None or scalar.value > prior_value:
+                        metric_map[tag][scalar.step] = scalar.value
+        return dict(metric_map)
+
+    def _find_event_files(self, log_dir: str) -> list[str]:
+        event_files: list[str] = []
+        for root, _, files in os.walk(log_dir):
+            for file_name in files:
+                if file_name.startswith("events.out.tfevents."):
+                    event_files.append(os.path.join(root, file_name))
+        return sorted(event_files)
+
+
+def get_step_time(metric_map: dict[str, dict[int, float]], step: int) -> Optional[float]:
+    """Select the best available step duration metric for one step."""
+    for metric_name in STEP_TIME_METRIC_CANDIDATES:
+        if step in metric_map.get(metric_name, {}):
+            return float(metric_map[metric_name][step])
+    return None
+
+
+def extract_raw_metrics_for_step(
+    metric_map: dict[str, dict[int, float]], step: int
+) -> dict[str, float]:
+    """Extract all scalar metrics that were logged for one step."""
+    return {
+        metric_name: float(step_values[step])
+        for metric_name, step_values in metric_map.items()
+        if step in step_values
+    }
+
+
+def collect_step_metrics(metric_map: dict[str, dict[int, float]]) -> list[dict[str, Any]]:
+    """Build per-step metrics from TensorBoard scalars."""
+    step_numbers = sorted(metric_map.get(FINISHED_TASK_METRIC_NAME, {}).keys())
+    step_metrics: list[dict[str, Any]] = []
+    for step in step_numbers:
+        finished_task_count = float(metric_map[FINISHED_TASK_METRIC_NAME][step])
+        step_time_sec = get_step_time(metric_map, step)
+        step_metrics.append(
+            {
+                "step": step,
+                "finished_task_count": finished_task_count,
+                "step_time_sec": step_time_sec,
+                "throughput_task_per_min": (
+                    finished_task_count / step_time_sec * 60.0
+                    if step_time_sec is not None and step_time_sec > 0 and finished_task_count > 0
+                    else None
+                ),
+                "avg_task_time_sec": (
+                    step_time_sec / finished_task_count
+                    if step_time_sec is not None and step_time_sec > 0 and finished_task_count > 0
+                    else None
+                ),
+                "raw_metrics": extract_raw_metrics_for_step(metric_map, step),
+            }
+        )
+    return step_metrics
+
+
+def build_global_metrics(step_metrics: list[dict[str, Any]]) -> dict[str, Optional[float]]:
+    """Aggregate global metrics from per-step records."""
+    total_finished_task_count = float(
+        sum(step_metric["finished_task_count"] for step_metric in step_metrics)
+    )
+    total_step_time_sec = sum(
+        step_metric["step_time_sec"]
+        for step_metric in step_metrics
+        if step_metric["step_time_sec"] is not None
+    )
+    if total_finished_task_count > 0 and total_step_time_sec > 0:
+        overall_throughput = total_finished_task_count / total_step_time_sec * 60.0
+        overall_avg_task_time = total_step_time_sec / total_finished_task_count
+    else:
+        overall_throughput = None
+        overall_avg_task_time = None
+    return {
+        "total_finished_task_count": total_finished_task_count,
+        "overall_throughput_task_per_min": overall_throughput,
+        "overall_avg_task_time_sec": overall_avg_task_time,
+        "total_step_time_sec": total_step_time_sec if total_step_time_sec > 0 else None,
+    }
\ No newline at end of file
diff --git a/trinity/perf/report_utils.py b/trinity/perf/report_utils.py
new file mode 100644
index 00000000000..bc253f80f70
--- /dev/null
+++ b/trinity/perf/report_utils.py
@@ -0,0 +1,59 @@
+"""Reporting helpers for performance tooling."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+
+from trinity.perf.resource_backends import ResourceSample
+
+
+def build_resource_timeline_payload(samples: list[ResourceSample]) -> dict:
+    """Convert raw resource samples into a chart-friendly timeline payload."""
+    timeline = [sample.to_dict() for sample in samples]
+    cpu_series = [
+        {"timestamp": sample.timestamp, "value": sample.cpu_percent} for sample in samples
+    ]
+    memory_rss_series = [
+        {"timestamp": sample.timestamp, "value": sample.memory_rss_mb} for sample in samples
+    ]
+    memory_percent_series = [
+        {"timestamp": sample.timestamp, "value": sample.memory_percent} for sample in samples
+    ]
+
+    gpu_util_series: dict[int, list[dict]] = defaultdict(list)
+    gpu_memory_series: dict[int, list[dict]] = defaultdict(list)
+    gpu_names: dict[int, str] = {}
+    for sample in samples:
+        for gpu_sample in sample.gpu_metrics:
+            gpu_names[gpu_sample.gpu_id] = gpu_sample.name
+            gpu_util_series[gpu_sample.gpu_id].append(
+                {"timestamp": sample.timestamp, "value": gpu_sample.gpu_util_percent}
+            )
+            gpu_memory_series[gpu_sample.gpu_id].append(
+                {"timestamp": sample.timestamp, "value": gpu_sample.gpu_memory_used_mb}
+            )
+
+    return {
+        "resource_timeline": timeline,
+        "chart_series": {
+            "cpu_percent": cpu_series,
+            "memory_rss_mb": memory_rss_series,
+            "memory_percent": memory_percent_series,
+            "gpu_util_percent": {
+                str(gpu_id): {
+                    "gpu_id": gpu_id,
+                    "name": gpu_names[gpu_id],
+                    "values": values,
+                }
+                for gpu_id, values in gpu_util_series.items()
+            },
+            "gpu_memory_used_mb": {
+                str(gpu_id): {
+                    "gpu_id": gpu_id,
+                    "name": gpu_names[gpu_id],
+                    "values": values,
+                }
+                for gpu_id, values in gpu_memory_series.items()
+            },
+        },
+    }
\ No newline at end of file
diff --git a/trinity/perf/resource_backends.py b/trinity/perf/resource_backends.py
new file mode 100644
index 00000000000..913fc324347
--- /dev/null
+++ b/trinity/perf/resource_backends.py
@@ -0,0 +1,119 @@
+"""System resource collection backends for performance tooling."""
+
+from __future__ import annotations
+
+import time
+from dataclasses import asdict, dataclass
+
+import psutil
+from pynvml import (
+    NVMLError,
+    nvmlDeviceGetCount,
+    nvmlDeviceGetHandleByIndex,
+    nvmlDeviceGetMemoryInfo,
+    nvmlDeviceGetName,
+    nvmlDeviceGetUtilizationRates,
+    nvmlInit,
+    nvmlShutdown,
+)
+
+
+@dataclass
+class GPUSample:
+    """One GPU sample at one point in time."""
+
+    gpu_id: int
+    name: str
+    gpu_util_percent: float
+    gpu_memory_used_mb: float
+    gpu_memory_total_mb: float
+
+    def to_dict(self) -> dict:
+        """Serialize the GPU sample to a dictionary."""
+        return asdict(self)
+
+
+@dataclass
+class ResourceSample:
+    """One system resource sample at one point in time."""
+
+    timestamp: float
+    cpu_percent: float
+    memory_rss_mb: float
+    memory_percent: float
+    gpu_metrics: list[GPUSample]
+
+    def to_dict(self) -> dict:
+        """Serialize the resource sample to a dictionary."""
+        payload = asdict(self)
+        payload["gpu_metrics"] = [gpu_sample.to_dict() for gpu_sample in self.gpu_metrics]
+        return payload
+
+
+class SystemResourceBackend:
+    """Collect system-level CPU, memory and per-GPU metrics."""
+
+    def __init__(self) -> None:
+        self._process = psutil.Process()
+        self._initialized = False
+        self._gpu_count = 0
+
+    def open(self) -> None:
+        """Initialize the GPU management library and validate the environment."""
+        if self._initialized:
+            return
+        try:
+            nvmlInit()
+            self._gpu_count = nvmlDeviceGetCount()
+        except NVMLError as error:
+            raise RuntimeError(f"Failed to initialize NVML: {error}") from error
+        if self._gpu_count <= 0:
+            self.close()
+            raise RuntimeError("No GPU devices detected by NVML.")
+
+        self._process.cpu_percent(interval=None)
+        self._initialized = True
+
+    def close(self) -> None:
+        """Release NVML resources."""
+        if not self._initialized:
+            return
+        try:
+            nvmlShutdown()
+        except NVMLError:
+            pass
+        self._initialized = False
+        self._gpu_count = 0
+
+    def sample(self) -> ResourceSample:
+        """Collect one resource sample."""
+        if not self._initialized:
+            raise RuntimeError("SystemResourceBackend must be opened before sampling.")
+
+        timestamp = time.time()
+        memory_info = self._process.memory_info()
+        gpu_metrics: list[GPUSample] = []
+        for gpu_index in range(self._gpu_count):
+            gpu_handle = nvmlDeviceGetHandleByIndex(gpu_index)
+            utilization = nvmlDeviceGetUtilizationRates(gpu_handle)
+            gpu_memory = nvmlDeviceGetMemoryInfo(gpu_handle)
+            gpu_name = nvmlDeviceGetName(gpu_handle)
+            if isinstance(gpu_name, bytes):
+                gpu_name = gpu_name.decode("utf-8")
+            gpu_metrics.append(
+                GPUSample(
+                    gpu_id=gpu_index,
+                    name=str(gpu_name),
+                    gpu_util_percent=float(utilization.gpu),
+                    gpu_memory_used_mb=float(gpu_memory.used) / (1024 * 1024),
+                    gpu_memory_total_mb=float(gpu_memory.total) / (1024 * 1024),
+                )
+            )
+
+        return ResourceSample(
+            timestamp=timestamp,
+            cpu_percent=float(self._process.cpu_percent(interval=None)),
+            memory_rss_mb=float(memory_info.rss) / (1024 * 1024),
+            memory_percent=float(self._process.memory_percent()),
+            gpu_metrics=gpu_metrics,
+        )
\ No newline at end of file
diff --git a/trinity/perf/resource_sampler.py b/trinity/perf/resource_sampler.py
new file mode 100644
index 00000000000..19b4073b144
--- /dev/null
+++ b/trinity/perf/resource_sampler.py
@@ -0,0 +1,69 @@
+"""Threaded resource sampler for performance tooling."""
+
+from __future__ import annotations
+
+import threading
+import time
+from typing import Optional
+
+from trinity.perf.resource_backends import ResourceSample, SystemResourceBackend
+
+
+class ResourceSampler:
+    """Periodically collect system resource samples in a background thread."""
+
+    def __init__(
+        self,
+        interval_seconds: float,
+        backend: Optional[SystemResourceBackend] = None,
+    ) -> None:
+        if interval_seconds <= 0:
+            raise ValueError("interval_seconds must be greater than 0.")
+        self.interval_seconds = interval_seconds
+        self.backend = backend or SystemResourceBackend()
+        self._samples: list[ResourceSample] = []
+        self._lock = threading.Lock()
+        self._stop_event = threading.Event()
+        self._thread: Optional[threading.Thread] = None
+        self._started = False
+
+    def start(self) -> None:
+        """Start sampling in the background."""
+        if self._started:
+            raise RuntimeError("ResourceSampler has already been started.")
+        self.backend.open()
+        self._started = True
+        self._stop_event.clear()
+        self._thread = threading.Thread(target=self._run, name="resource-sampler", daemon=True)
+        self._thread.start()
+
+    def stop(self) -> list[ResourceSample]:
+        """Stop sampling and return the collected samples."""
+        if not self._started:
+            return self.samples()
+        self._stop_event.set()
+        if self._thread is not None:
+            self._thread.join()
+            self._thread = None
+        self.backend.close()
+        self._started = False
+        return self.samples()
+
+    def samples(self) -> list[ResourceSample]:
+        """Return a snapshot of all collected samples."""
+        with self._lock:
+            return list(self._samples)
+
+    def _run(self) -> None:
+        next_sample_time = time.monotonic()
+        while not self._stop_event.is_set():
+            self._collect_once()
+            next_sample_time += self.interval_seconds
+            remaining_time = max(0.0, next_sample_time - time.monotonic())
+            if self._stop_event.wait(remaining_time):
+                break
+
+    def _collect_once(self) -> None:
+        sample = self.backend.sample()
+        with self._lock:
+            self._samples.append(sample)
\ No newline at end of file

From f8993a7cffc65c965d97b6bd1e16669d6b9382b5 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Tue, 28 Apr 2026 12:02:50 +0800
Subject: [PATCH 02/20] update example

---
 perf/scripts/explorer/example.yaml | 31 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/perf/scripts/explorer/example.yaml b/perf/scripts/explorer/example.yaml
index e95697f3978..d7efc4a470a 100644
--- a/perf/scripts/explorer/example.yaml
+++ b/perf/scripts/explorer/example.yaml
@@ -5,27 +5,22 @@ name: example
 checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,/tmp/trinity-checkpoints}
 
 algorithm:
-  algorithm_type: ppo
-  repeat_times: 4
-  optimizer:
-    lr: 1e-06
-
-data_processor: {}
+  algorithm_type: grpo
+  repeat_times: 8
 
 model:
-  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-0.6B}
-  max_prompt_tokens: 256
-  max_response_tokens: 1024
-  enable_thinking: false
+  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3.5-0.8B}
+  max_prompt_tokens: 32768
+  max_response_tokens: 8192
 
 cluster:
   ray_address: ${oc.env:RAY_ADDRESS,auto}
   node_num: 1
-  gpu_per_node: 1
+  gpu_per_node: 4
 
 buffer:
-  batch_size: 8
-  total_epochs: 1
+  batch_size: 32
+  total_steps: 5
   explorer_input:
     taskset:
       name: taskset
@@ -45,15 +40,15 @@ buffer:
 explorer:
   runner_per_model: 4
   max_timeout: 900
-  max_retry_times: 2
+  max_retry_times: 0
   eval_interval: 1000
   eval_on_startup: false
   rollout_model:
     engine_num: 1
     tensor_parallel_size: 1
     enforce_eager: false
-    enable_prefix_caching: false
-    enable_chunked_prefill: false
+    enable_prefix_caching: true
+    enable_chunked_prefill: true
     gpu_memory_utilization: 0.85
     dtype: bfloat16
     seed: 42
@@ -63,7 +58,7 @@ monitor:
   monitor_type: tensorboard
 
 synchronizer:
-  sync_method: nccl
+  sync_method: checkpoint
   sync_style: fixed
-  sync_interval: 1
+  sync_interval: 20
   sync_timeout: 1200

From 93e2df1dc7b9034218d7a0a137c848a319960db1 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Tue, 28 Apr 2026 14:55:34 +0800
Subject: [PATCH 03/20] add perf command

---
 perf/scripts/explorer/README.md               | 18 ++--
 perf/scripts/explorer/example.yaml            | 64 -------------
 trinity/cli/launcher.py                       | 52 ++++++++++
 trinity/perf/__init__.py                      |  8 ++
 .../main.py => trinity/perf/explorer_perf.py  | 96 +++++++------------
 5 files changed, 103 insertions(+), 135 deletions(-)
 delete mode 100644 perf/scripts/explorer/example.yaml
 rename perf/scripts/explorer/main.py => trinity/perf/explorer_perf.py (70%)

diff --git a/perf/scripts/explorer/README.md b/perf/scripts/explorer/README.md
index 88681d2f8b9..6bda9823cf3 100644
--- a/perf/scripts/explorer/README.md
+++ b/perf/scripts/explorer/README.md
@@ -56,10 +56,10 @@ perf/
     explorer/
       README.md
       example.yaml
-      main.py
 trinity/
   perf/
     __init__.py
+    explorer_perf.py
     explorer_metrics.py
     report_utils.py
     resource_backends.py
@@ -74,8 +74,8 @@ trinity/
    提供独立资源采样器，支持启动、停止、导出原始样本和聚合统计。
 3. `trinity/perf/report_utils.py`
    提供时间序列聚合、百分位数计算和统一 JSON 序列化能力。
-4. `perf/scripts/explorer/main.py`
-   负责 Explorer perf 的命令行入口、单次运行编排和结果落盘。
+4. `trinity/perf/explorer_perf.py`
+  负责 Explorer perf 的单次运行编排和结果落盘。
 5. `perf/scripts/explorer/example.yaml`
    提供最小可运行的 Trinity Explorer 配置样例。
 
@@ -254,7 +254,7 @@ Explorer 运行指标优先从 TensorBoard 事件文件提取，原因如下：
 ## 使用方法
 
 ```
-python main.py --config <path_to_config> --output-path <path_to_output> [--monitor-interval <interval_in_seconds>]
+python -m trinity.cli.launcher perf --module explorer --config <path_to_config> --output-path <path_to_output> [--monitor-interval <interval_in_seconds>]
 ```
 
 建议支持以下参数：
@@ -267,8 +267,10 @@ python main.py --config <path_to_config> --output-path <path_to_output> [--monit
    资源采样间隔，默认 5 秒。
 4. `--timeout`
    整次 perf 运行的超时时间，可选。
-5. `--resource-backend`
-   资源采样后端，例如 `auto`、`psutil`、`pynvml`。
+5. `--total-steps`
+  覆盖配置中的 Explorer 总步数，默认 5。
+6. `--module`
+  当前固定为 `explorer`，为后续扩展 trainer perf 预留统一入口。
 
 ## 配置要求
 
@@ -339,8 +341,8 @@ python main.py --config <path_to_config> --output-path <path_to_output> [--monit
 2. 实现 `trinity/perf/resource_backends.py`。
 3. 实现 `trinity/perf/resource_sampler.py`。
 4. 实现 `trinity/perf/report_utils.py`。
-5. 在 `perf/scripts/explorer/main.py` 中完成单次运行编排。
-6. 在 `perf/scripts/explorer/main.py` 中实现 TensorBoard 指标解析。
+5. 在 `trinity/perf/explorer_perf.py` 中完成单次运行编排。
+6. 在 `trinity/perf/explorer_perf.py` 中实现 TensorBoard 指标解析。
 7. 补充 `perf/scripts/explorer/example.yaml`。
 8. 补充测试和文档示例。
 
diff --git a/perf/scripts/explorer/example.yaml b/perf/scripts/explorer/example.yaml
deleted file mode 100644
index d7efc4a470a..00000000000
--- a/perf/scripts/explorer/example.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-mode: explore
-project: Trinity-RFT
-group: explorer-perf
-name: example
-checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,/tmp/trinity-checkpoints}
-
-algorithm:
-  algorithm_type: grpo
-  repeat_times: 8
-
-model:
-  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3.5-0.8B}
-  max_prompt_tokens: 32768
-  max_response_tokens: 8192
-
-cluster:
-  ray_address: ${oc.env:RAY_ADDRESS,auto}
-  node_num: 1
-  gpu_per_node: 4
-
-buffer:
-  batch_size: 32
-  total_steps: 5
-  explorer_input:
-    taskset:
-      name: taskset
-      storage_type: file
-      path: ${oc.env:TRINITY_TASKSET_PATH,null}
-      split: train
-      format:
-        prompt_key: question
-        response_key: answer
-      rollout_args:
-        temperature: 1.0
-        logprobs: 0
-      workflow_type: math_workflow
-      reward_fn_type: countdown_reward
-    eval_tasksets: []
-
-explorer:
-  runner_per_model: 4
-  max_timeout: 900
-  max_retry_times: 0
-  eval_interval: 1000
-  eval_on_startup: false
-  rollout_model:
-    engine_num: 1
-    tensor_parallel_size: 1
-    enforce_eager: false
-    enable_prefix_caching: true
-    enable_chunked_prefill: true
-    gpu_memory_utilization: 0.85
-    dtype: bfloat16
-    seed: 42
-    enable_openai_api: false
-
-monitor:
-  monitor_type: tensorboard
-
-synchronizer:
-  sync_method: checkpoint
-  sync_style: fixed
-  sync_interval: 20
-  sync_timeout: 1200
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 98aec921a12..99b62ea9106 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -14,6 +14,7 @@
 from trinity.common.constants import DEBUG_NAMESPACE, PLUGIN_DIRS_ENV_VAR
 from trinity.manager.checkpoint_converter import Converter
 from trinity.manager.state_manager import StateManager
+from trinity.perf import ExplorerPerfOptions, run_explorer_perf, write_explorer_perf_output
 from trinity.utils.dlc_utils import is_running, setup_ray_cluster, stop_ray_cluster
 from trinity.utils.log import get_logger
 from trinity.utils.plugin_loader import load_plugins
@@ -278,6 +279,57 @@ def studio(
     ConfigManager.run(port)
 
 
+@app.command()
+def perf(
+    config: Annotated[
+        str,
+        typer.Option("--config", "-c", help="Path to the config file."),
+    ],
+    module: Annotated[
+        str,
+        typer.Option("--module", "-m", help="Perf module to run. Currently only supports 'explorer'."),
+    ] = "explorer",
+    output_path: Annotated[
+        str,
+        typer.Option("--output-path", "-o", help="Path to the output JSON file."),
+    ] = "perf/scripts/explorer/output/perf_result.json",
+    monitor_interval: Annotated[
+        float,
+        typer.Option("--monitor-interval", help="Resource sampling interval in seconds."),
+    ] = 5.0,
+    total_steps: Annotated[
+        int,
+        typer.Option("--total-steps", help="Total steps to run the explorer for."),
+    ] = 5,
+    timeout: Annotated[
+        Optional[float],
+        typer.Option("--timeout", help="Optional timeout in seconds for prepare, sync and explore calls."),
+    ] = None,
+    plugin_dir: Annotated[
+        Optional[str],
+        typer.Option("--plugin-dir", help="Path to the directory containing plugin modules."),
+    ] = None,
+) -> None:
+    """Run performance benchmark."""
+    if module != "explorer":
+        raise typer.BadParameter("Only --module explorer is supported for now.")
+
+    if plugin_dir:
+        os.environ[PLUGIN_DIRS_ENV_VAR] = plugin_dir
+
+    options = ExplorerPerfOptions(
+        config_path=config,
+        output_path=output_path,
+        monitor_interval=monitor_interval,
+        total_steps=total_steps,
+        timeout=timeout,
+    )
+    payload = run_explorer_perf(options)
+    write_explorer_perf_output(output_path, payload)
+    if not payload["status"]["success"]:
+        raise typer.Exit(code=1)
+
+
 @app.command()
 def debug(
     config: Annotated[
diff --git a/trinity/perf/__init__.py b/trinity/perf/__init__.py
index 717772f4d31..d638a9f35e1 100644
--- a/trinity/perf/__init__.py
+++ b/trinity/perf/__init__.py
@@ -5,13 +5,21 @@
     build_global_metrics,
     collect_step_metrics,
 )
+from trinity.perf.explorer_perf import (
+    ExplorerPerfOptions,
+    run_explorer_perf,
+    write_explorer_perf_output,
+)
 from trinity.perf.report_utils import build_resource_timeline_payload
 from trinity.perf.resource_sampler import ResourceSampler
 
 __all__ = [
+    "ExplorerPerfOptions",
     "ResourceSampler",
     "TensorBoardScalarReader",
     "build_global_metrics",
     "build_resource_timeline_payload",
     "collect_step_metrics",
+    "run_explorer_perf",
+    "write_explorer_perf_output",
 ]
\ No newline at end of file
diff --git a/perf/scripts/explorer/main.py b/trinity/perf/explorer_perf.py
similarity index 70%
rename from perf/scripts/explorer/main.py
rename to trinity/perf/explorer_perf.py
index 898b8fc2609..8aeab1f4b5a 100644
--- a/perf/scripts/explorer/main.py
+++ b/trinity/perf/explorer_perf.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
-import argparse
 import json
 import os
 import socket
 import time
 import traceback
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
@@ -15,53 +15,35 @@
 from trinity.buffer.pipelines.task_pipeline import check_and_run_task_pipeline
 from trinity.common.config import Config, load_config
 from trinity.explorer.explorer import Explorer
-from trinity.perf import (
-    ResourceSampler,
+from trinity.perf.explorer_metrics import (
     TensorBoardScalarReader,
     build_global_metrics,
-    build_resource_timeline_payload,
     collect_step_metrics,
 )
+from trinity.perf.report_utils import build_resource_timeline_payload
+from trinity.perf.resource_sampler import ResourceSampler
 from trinity.utils.plugin_loader import load_plugins
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse CLI arguments."""
-    parser = argparse.ArgumentParser(description="Run Explorer performance collection.")
-    parser.add_argument("--config", required=True, help="Path to the Trinity explorer config.")
-    parser.add_argument("--output-path", required=True, help="Path to the output JSON file.")
-    parser.add_argument(
-        "--monitor-interval",
-        type=float,
-        default=5.0,
-        help="Resource sampling interval in seconds.",
-    )
-    parser.add_argument(
-        "--timeout",
-        type=float,
-        default=None,
-        help="Optional timeout in seconds for prepare, sync and explore calls.",
-    )
-    return parser.parse_args()
-
-
-def validate_config(config: Config) -> None:
+@dataclass(slots=True)
+class ExplorerPerfOptions:
+    config_path: str
+    output_path: str
+    monitor_interval: float = 5.0
+    total_steps: int = 5
+    timeout: Optional[float] = None
+
+
+def validate_explorer_perf_config(config: Config) -> None:
     """Validate perf-specific config constraints."""
     if config.mode != "explore":
         raise ValueError(f"Explorer perf requires mode 'explore', got '{config.mode}'.")
-    if config.monitor.monitor_type != "tensorboard":
-        raise ValueError(
-            "Explorer perf requires monitor.monitor_type='tensorboard' so step metrics can "
-            "be read from local event files."
-        )
 
 
-def build_output_payload(
+def build_explorer_perf_payload(
     *,
     config: Optional[Config],
-    config_path: str,
-    output_path: str,
-    monitor_interval: float,
+    options: ExplorerPerfOptions,
     startup_time_sec: Optional[float],
     run_time_sec: Optional[float],
     total_time_sec: Optional[float],
@@ -81,14 +63,14 @@ def build_output_payload(
                 config.monitor.cache_dir, "tensorboard", config.explorer.name
             ),
             "log_dir": config.log.save_dir,
-            "output_json": output_path,
+            "output_json": options.output_path,
         }
 
     return {
         "run_meta": {
-            "config_path": config_path,
+            "config_path": options.config_path,
             "explorer_name": explorer_name,
-            "monitor_interval_sec": monitor_interval,
+            "monitor_interval_sec": options.monitor_interval,
             "hostname": socket.gethostname(),
             "pid": os.getpid(),
             "generated_at": time.time(),
@@ -111,14 +93,14 @@ def build_output_payload(
     }
 
 
-def write_output(output_path: str, payload: dict[str, Any]) -> None:
+def write_explorer_perf_output(output_path: str, payload: dict[str, Any]) -> None:
     """Write the final payload to disk."""
     output_file = Path(output_path)
     output_file.parent.mkdir(parents=True, exist_ok=True)
     output_file.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
 
 
-def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
+def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
     """Run Explorer perf collection and return the result payload."""
     load_plugins()
     config: Optional[Config] = None
@@ -135,8 +117,10 @@ def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
     run_started_at: Optional[float] = None
 
     try:
-        config = load_config(args.config)
-        validate_config(config)
+        config = load_config(options.config_path)
+        config.buffer.total_steps = options.total_steps
+        config.monitor.monitor_type = "tensorboard"
+        validate_explorer_perf_config(config)
         config.check_and_update()
 
         ray.init(
@@ -147,17 +131,17 @@ def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
         )
         check_and_run_task_pipeline(config)
 
-        sampler = ResourceSampler(interval_seconds=args.monitor_interval)
+        sampler = ResourceSampler(interval_seconds=options.monitor_interval)
         sampler.start()
 
         startup_started_at = time.perf_counter()
         explorer_actor = Explorer.get_actor(config)
-        ray.get(explorer_actor.prepare.remote(), timeout=args.timeout)
+        ray.get(explorer_actor.prepare.remote(), timeout=options.timeout)
         startup_time_sec = time.perf_counter() - startup_started_at
 
         run_started_at = time.perf_counter()
-        ray.get(explorer_actor.sync_weight.remote(), timeout=args.timeout)
-        ray.get(explorer_actor.explore.remote(), timeout=args.timeout)
+        ray.get(explorer_actor.sync_weight.remote(), timeout=options.timeout)
+        ray.get(explorer_actor.explore.remote(), timeout=options.timeout)
         run_time_sec = time.perf_counter() - run_started_at
         total_time_sec = time.perf_counter() - startup_started_at
         success = True
@@ -175,7 +159,7 @@ def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
 
         if explorer_actor is not None and ray.is_initialized():
             try:
-                ray.get(explorer_actor.shutdown.remote(), timeout=args.timeout)
+                ray.get(explorer_actor.shutdown.remote(), timeout=options.timeout)
             except (RuntimeError, TimeoutError, GetTimeoutError, RayTaskError):
                 if error is None:
                     error = traceback.format_exc()
@@ -191,11 +175,9 @@ def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
         if ray.is_initialized():
             ray.shutdown()
 
-    return build_output_payload(
+    return build_explorer_perf_payload(
         config=config,
-        config_path=args.config,
-        output_path=args.output_path,
-        monitor_interval=args.monitor_interval,
+        options=options,
         startup_time_sec=startup_time_sec,
         run_time_sec=run_time_sec,
         total_time_sec=total_time_sec,
@@ -203,16 +185,4 @@ def run_explorer_perf(args: argparse.Namespace) -> dict[str, Any]:
         step_metrics=step_metrics,
         success=success,
         error=error,
-    )
-
-
-def main() -> int:
-    """CLI entrypoint."""
-    args = parse_args()
-    payload = run_explorer_perf(args)
-    write_output(args.output_path, payload)
-    return 0 if payload["status"]["success"] else 1
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
+    )
\ No newline at end of file

From 958c7d649e3dda9de6ac768ddd3788db4b33469e Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Tue, 28 Apr 2026 19:46:47 +0800
Subject: [PATCH 04/20] add perf suite

---
 .../source/tutorial/metrics_reference.md      |   2 +-
 .../source_zh/tutorial/metrics_reference.md   |   2 +-
 perf/scripts/explorer/README.md               |  10 +-
 .../buffer/pipelines/experience_pipeline.py   |   6 +-
 trinity/cli/launcher.py                       | 250 ++++++++++++++----
 trinity/explorer/explorer.py                  |   4 +-
 trinity/perf/__init__.py                      |  18 +-
 trinity/perf/report_utils.py                  |   2 +-
 trinity/perf/resource_backends.py             |   2 +-
 trinity/perf/resource_sampler.py              |   2 +-
 .../perf/{explorer_perf.py => stage_perf.py}  |  51 ++--
 ...orer_metrics.py => tensorboard_metrics.py} |   4 +-
 12 files changed, 242 insertions(+), 111 deletions(-)
 rename trinity/perf/{explorer_perf.py => stage_perf.py} (74%)
 rename trinity/perf/{explorer_metrics.py => tensorboard_metrics.py} (98%)

diff --git a/docs/sphinx_doc/source/tutorial/metrics_reference.md b/docs/sphinx_doc/source/tutorial/metrics_reference.md
index 205e9eb7af8..66cf67397ea 100644
--- a/docs/sphinx_doc/source/tutorial/metrics_reference.md
+++ b/docs/sphinx_doc/source/tutorial/metrics_reference.md
@@ -164,7 +164,7 @@ This category includes metrics that track the training dynamics of the policy (a
 This category includes metrics that track the processing of experiences through various pipeline operators (`experience_pipeline/`) and data sampling statistics (`sample/`). These metrics are aggregated at the step level, as the experience pipeline and data sampling are performed in each step.
 
 
-#### Experience Pipeline Metrics (`experience_pipeline/` and `time/experience_pipeline/`)
+#### Experience Pipeline Metrics (`experience_pipeline/` and `experience_pipeline/time/`)
 
 Experience pipeline metrics track the processing of experiences through various pipeline operators. Each metric represents the count of the specific operator in one step.
 
diff --git a/docs/sphinx_doc/source_zh/tutorial/metrics_reference.md b/docs/sphinx_doc/source_zh/tutorial/metrics_reference.md
index 0fad6c42a23..a3e9792c701 100644
--- a/docs/sphinx_doc/source_zh/tutorial/metrics_reference.md
+++ b/docs/sphinx_doc/source_zh/tutorial/metrics_reference.md
@@ -165,7 +165,7 @@ graph TD
 此类别包括跟踪通过各种数据处理操作（`experience_pipeline/`）和数据采样统计（`sample/`）的指标。这些指标在步骤（step）级别计算，因为 experience 处理和数据采样会在每个步骤中执行一次。
 
 
-#### Experience Pipeline 相关指标（`experience_pipeline/` 和 `time/experience_pipeline/`）
+#### Experience Pipeline 相关指标（`experience_pipeline/` 和 `experience_pipeline/time/`）
 
 Experience Pipeline 相关的指标统计了和数据处理相关的值，每个指标表示一个步骤中特定操作的统计值。
 
diff --git a/perf/scripts/explorer/README.md b/perf/scripts/explorer/README.md
index 6bda9823cf3..27556ffd09a 100644
--- a/perf/scripts/explorer/README.md
+++ b/perf/scripts/explorer/README.md
@@ -59,8 +59,8 @@ perf/
 trinity/
   perf/
     __init__.py
-    explorer_perf.py
-    explorer_metrics.py
+    stage_perf.py
+    tensorboard_metrics.py
     report_utils.py
     resource_backends.py
     resource_sampler.py
@@ -74,7 +74,7 @@ trinity/
    提供独立资源采样器，支持启动、停止、导出原始样本和聚合统计。
 3. `trinity/perf/report_utils.py`
    提供时间序列聚合、百分位数计算和统一 JSON 序列化能力。
-4. `trinity/perf/explorer_perf.py`
+4. `trinity/perf/stage_perf.py`
   负责 Explorer perf 的单次运行编排和结果落盘。
 5. `perf/scripts/explorer/example.yaml`
    提供最小可运行的 Trinity Explorer 配置样例。
@@ -341,8 +341,8 @@ python -m trinity.cli.launcher perf --module explorer --config <path_to_config>
 2. 实现 `trinity/perf/resource_backends.py`。
 3. 实现 `trinity/perf/resource_sampler.py`。
 4. 实现 `trinity/perf/report_utils.py`。
-5. 在 `trinity/perf/explorer_perf.py` 中完成单次运行编排。
-6. 在 `trinity/perf/explorer_perf.py` 中实现 TensorBoard 指标解析。
+5. 在 `trinity/perf/stage_perf.py` 中完成单次运行编排。
+6. 在 `trinity/perf/stage_perf.py` 中实现 TensorBoard 指标解析。
 7. 补充 `perf/scripts/explorer/example.yaml`。
 8. 补充测试和文档示例。
 
diff --git a/trinity/buffer/pipelines/experience_pipeline.py b/trinity/buffer/pipelines/experience_pipeline.py
index 8052d8ccd1e..d7b0b4acb49 100644
--- a/trinity/buffer/pipelines/experience_pipeline.py
+++ b/trinity/buffer/pipelines/experience_pipeline.py
@@ -152,16 +152,16 @@ async def process(self, exp_bytes: bytes) -> Dict:
         # Process experiences through operators
         for idx, operator in enumerate(self.operators):
             with Timer(
-                metrics, f"time/experience_pipeline/operator/{idx}_{operator.__class__.__name__}"
+                metrics, f"experience_pipeline/time/operator/{idx}_{operator.__class__.__name__}"
             ):
                 exps, metric = await operator.process(exps)
                 metrics.update(metric)
         metrics["experience_count"] = len(exps)
 
         # Write processed experiences to output buffer
-        with Timer(metrics, "time/experience_pipeline/write"):
+        with Timer(metrics, "experience_pipeline/time/write"):
             await self.output.write_async(exps)
-        metrics["time/experience_pipeline/total"] = time.time() - st
+        metrics["experience_pipeline/time/total"] = time.time() - st
 
         # prefix metrics keys with 'pipeline/'
         result_metrics = {}
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 99b62ea9106..334ea71d9ed 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -2,7 +2,9 @@
 import asyncio
 import os
 import sys
+import time
 import traceback
+from dataclasses import dataclass
 from pprint import pprint
 from typing import Optional
 
@@ -14,7 +16,11 @@
 from trinity.common.constants import DEBUG_NAMESPACE, PLUGIN_DIRS_ENV_VAR
 from trinity.manager.checkpoint_converter import Converter
 from trinity.manager.state_manager import StateManager
-from trinity.perf import ExplorerPerfOptions, run_explorer_perf, write_explorer_perf_output
+from trinity.perf import (
+    ExplorerPerfOptions,
+    run_explorer_perf,
+    write_explorer_perf_output,
+)
 from trinity.utils.dlc_utils import is_running, setup_ray_cluster, stop_ray_cluster
 from trinity.utils.log import get_logger
 from trinity.utils.plugin_loader import load_plugins
@@ -28,71 +34,190 @@
 )
 
 
-def bench(config: Config) -> None:
+@dataclass(slots=True)
+class StageError:
+    type_name: str
+    message: str
+    traceback_text: str
+
+
+@dataclass(slots=True)
+class StageStatus:
+    stage: str
+    success: bool
+    startup_time_sec: Optional[float] = None
+    run_time_sec: Optional[float] = None
+    total_time_sec: Optional[float] = None
+    error: Optional[StageError] = None
+
+
+def _build_stage_error(error: BaseException) -> StageError:
+    return StageError(
+        type_name=type(error).__name__,
+        message=str(error),
+        traceback_text=traceback.format_exc(),
+    )
+
+
+def bench(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
     """Evaluate model."""
     from trinity.explorer.explorer import Explorer
 
     config.explorer.name = "benchmark"
     explorer = Explorer.get_actor(config)
+    startup_started_at = time.perf_counter()
+    startup_time_sec: Optional[float] = None
     try:
-        ray.get(explorer.prepare.remote())
-        ray.get(explorer.benchmark.remote())
+        ray.get(explorer.prepare.remote(), timeout=timeout)
+        startup_time_sec = time.perf_counter() - startup_started_at
+
+        run_started_at = time.perf_counter()
+        ray.get(explorer.benchmark.remote(), timeout=timeout)
+        run_time_sec = time.perf_counter() - run_started_at
         logger.info("Benchmark finished.")
-    except Exception:
-        logger.error(f"Benchmark failed:\n{traceback.format_exc()}")
+        return StageStatus(
+            stage="bench",
+            success=True,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+        )
+    except Exception as exc:
+        error = _build_stage_error(exc)
+        logger.error(f"Benchmark failed:\n{error.traceback_text}")
+        return StageStatus(
+            stage="bench",
+            success=False,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=None,
+            total_time_sec=time.perf_counter() - startup_started_at,
+            error=error,
+        )
     finally:
-        ray.get(explorer.shutdown.remote())
+        ray.get(explorer.shutdown.remote(), timeout=timeout)
 
 
-def explore(config: Config) -> None:
+def explore(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
     """Run explorer."""
     from trinity.explorer.explorer import Explorer
 
     explorer = Explorer.get_actor(config)
+    startup_started_at = time.perf_counter()
+    startup_time_sec: Optional[float] = None
+    run_started_at: Optional[float] = None
 
     try:
-        ray.get(explorer.prepare.remote())
-        ray.get(explorer.sync_weight.remote())
-        ray.get(explorer.explore.remote())
-    except Exception:
-        logger.error(f"Explorer failed:\n{traceback.format_exc()}")
+        ray.get(explorer.prepare.remote(), timeout=timeout)
+        startup_time_sec = time.perf_counter() - startup_started_at
+
+        run_started_at = time.perf_counter()
+        ray.get(explorer.sync_weight.remote(), timeout=timeout)
+        ray.get(explorer.explore.remote(), timeout=timeout)
+        run_time_sec = time.perf_counter() - run_started_at
+        return StageStatus(
+            stage="explore",
+            success=True,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+        )
+    except Exception as exc:
+        error = _build_stage_error(exc)
+        logger.error(f"Explorer failed:\n{error.traceback_text}")
+        run_time_sec = time.perf_counter() - run_started_at if run_started_at is not None else None
+        return StageStatus(
+            stage="explore",
+            success=False,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+            error=error,
+        )
     finally:
-        ray.get(explorer.shutdown.remote())
+        ray.get(explorer.shutdown.remote(), timeout=timeout)
 
 
-def train(config: Config) -> None:
+def train(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
     """Run trainer."""
     from trinity.trainer.trainer import Trainer
 
     trainer = Trainer.get_actor(config)
+    startup_started_at = time.perf_counter()
+    startup_time_sec: Optional[float] = None
+    run_started_at: Optional[float] = None
 
     try:
-        ray.get(trainer.prepare.remote())
-        ray.get(trainer.sync_weight.remote())
-        ray.get(trainer.train.remote())
-    except Exception:
-        logger.error(f"Trainer failed:\n{traceback.format_exc()}")
+        ray.get(trainer.prepare.remote(), timeout=timeout)
+        startup_time_sec = time.perf_counter() - startup_started_at
+
+        run_started_at = time.perf_counter()
+        ray.get(trainer.sync_weight.remote(), timeout=timeout)
+        ray.get(trainer.train.remote(), timeout=timeout)
+        run_time_sec = time.perf_counter() - run_started_at
+        return StageStatus(
+            stage="train",
+            success=True,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+        )
+    except Exception as exc:
+        error = _build_stage_error(exc)
+        logger.error(f"Trainer failed:\n{error.traceback_text}")
+        run_time_sec = time.perf_counter() - run_started_at if run_started_at is not None else None
+        return StageStatus(
+            stage="train",
+            success=False,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+            error=error,
+        )
     finally:
-        ray.get(trainer.shutdown.remote())
+        ray.get(trainer.shutdown.remote(), timeout=timeout)
 
 
-def serve(config: Config) -> None:
+def serve(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
     """Run explorer in server mode."""
     from trinity.explorer.explorer import Explorer
 
     explorer = Explorer.get_actor(config)
+    startup_started_at = time.perf_counter()
+    startup_time_sec: Optional[float] = None
+    run_started_at: Optional[float] = None
 
     try:
-        ray.get(explorer.prepare.remote())
-        ray.get(explorer.sync_weight.remote())
-        ray.get(explorer.serve.remote())
-    except Exception:
-        logger.error(f"Explorer failed:\n{traceback.format_exc()}")
+        ray.get(explorer.prepare.remote(), timeout=timeout)
+        startup_time_sec = time.perf_counter() - startup_started_at
+
+        run_started_at = time.perf_counter()
+        ray.get(explorer.sync_weight.remote(), timeout=timeout)
+        ray.get(explorer.serve.remote(), timeout=timeout)
+        run_time_sec = time.perf_counter() - run_started_at
+        return StageStatus(
+            stage="serve",
+            success=True,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+        )
+    except Exception as exc:
+        error = _build_stage_error(exc)
+        logger.error(f"Explorer failed:\n{error.traceback_text}")
+        run_time_sec = time.perf_counter() - run_started_at if run_started_at is not None else None
+        return StageStatus(
+            stage="serve",
+            success=False,
+            startup_time_sec=startup_time_sec,
+            run_time_sec=run_time_sec,
+            total_time_sec=time.perf_counter() - startup_started_at,
+            error=error,
+        )
     finally:
-        ray.get(explorer.shutdown.remote())
+        ray.get(explorer.shutdown.remote(), timeout=timeout)
 
 
-def both(config: Config) -> None:
+def both(config: Config) -> StageStatus:
     """Setup both explorer and trainer.
 
     For the explorer, a step contains `batch_size * sync_interval` number
@@ -107,6 +232,7 @@ def both(config: Config) -> None:
 
     explorer = Explorer.get_actor(config)
     trainer = Trainer.get_actor(config)
+    started_at = time.perf_counter()
     try:
         ray.get([explorer.__ray_ready__.remote(), trainer.__ray_ready__.remote()])
         ray.get(
@@ -148,8 +274,20 @@ def both(config: Config) -> None:
                 "==============================================================="
             )
             ray.wait(wait_ref, timeout=config.synchronizer.sync_timeout)
-    except Exception:
-        logger.error(f"Explorer or Trainer failed:\n{traceback.format_exc()}")
+        return StageStatus(
+            stage="both",
+            success=True,
+            total_time_sec=time.perf_counter() - started_at,
+        )
+    except Exception as exc:
+        error = _build_stage_error(exc)
+        logger.error(f"Explorer or Trainer failed:\n{error.traceback_text}")
+        return StageStatus(
+            stage="both",
+            success=False,
+            total_time_sec=time.perf_counter() - started_at,
+            error=error,
+        )
     finally:
         ray.wait(
             [explorer.shutdown.remote(), trainer.shutdown.remote()],
@@ -168,7 +306,7 @@ def both(config: Config) -> None:
 }
 
 
-def run_stage(config: Config) -> None:
+def run_stage(config: Config) -> StageStatus:
     ray.init(
         address=config.cluster.ray_address,
         ignore_reinit_error=True,
@@ -180,7 +318,7 @@ def run_stage(config: Config) -> None:
         from trinity.buffer.pipelines.task_pipeline import check_and_run_task_pipeline
 
         check_and_run_task_pipeline(config)
-        MODE_MAP[config.mode](config)
+        return MODE_MAP[config.mode](config)  # type: ignore[operator]
     finally:
         if config.monitor.enable_ray_timeline:
             timeline_file = os.path.join(config.monitor.cache_dir, "timeline.json")
@@ -287,12 +425,14 @@ def perf(
     ],
     module: Annotated[
         str,
-        typer.Option("--module", "-m", help="Perf module to run. Currently only supports 'explorer'."),
+        typer.Option(
+            "--module", "-m", help="Perf module to run. Currently only supports 'explorer'."
+        ),
     ] = "explorer",
     output_path: Annotated[
         str,
         typer.Option("--output-path", "-o", help="Path to the output JSON file."),
-    ] = "perf/scripts/explorer/output/perf_result.json",
+    ] = "./perf/output.json",
     monitor_interval: Annotated[
         float,
         typer.Option("--monitor-interval", help="Resource sampling interval in seconds."),
@@ -303,7 +443,9 @@ def perf(
     ] = 5,
     timeout: Annotated[
         Optional[float],
-        typer.Option("--timeout", help="Optional timeout in seconds for prepare, sync and explore calls."),
+        typer.Option(
+            "--timeout", help="Optional timeout in seconds for prepare, sync and explore calls."
+        ),
     ] = None,
     plugin_dir: Annotated[
         Optional[str],
@@ -314,20 +456,30 @@ def perf(
     if module != "explorer":
         raise typer.BadParameter("Only --module explorer is supported for now.")
 
-    if plugin_dir:
-        os.environ[PLUGIN_DIRS_ENV_VAR] = plugin_dir
-
-    options = ExplorerPerfOptions(
-        config_path=config,
-        output_path=output_path,
-        monitor_interval=monitor_interval,
-        total_steps=total_steps,
-        timeout=timeout,
-    )
-    payload = run_explorer_perf(options)
-    write_explorer_perf_output(output_path, payload)
+    try:
+        if plugin_dir:
+            os.environ[PLUGIN_DIRS_ENV_VAR] = plugin_dir
+
+        options = ExplorerPerfOptions(
+            config_path=config,
+            output_path=output_path,
+            monitor_interval=monitor_interval,
+            total_steps=total_steps,
+            timeout=timeout,
+        )
+        payload = run_explorer_perf(options)
+        write_explorer_perf_output(output_path, payload)
+    except Exception:
+        payload = {
+            "status": {
+                "success": False,
+                "error": traceback.format_exc(),
+            },
+            "data": None,
+        }
+        write_explorer_perf_output(output_path, payload)
     if not payload["status"]["success"]:
-        raise typer.Exit(code=1)
+        typer.echo(f"Failed to run perf: {payload['status']['error']}")
 
 
 @app.command()
diff --git a/trinity/explorer/explorer.py b/trinity/explorer/explorer.py
index db82c83067a..bf4dc95768b 100644
--- a/trinity/explorer/explorer.py
+++ b/trinity/explorer/explorer.py
@@ -382,14 +382,14 @@ async def _finish_steps(self, start_step: int, end_step: int, model_version: int
 
         # Record the time: read_task + explore_step (>=1) + eval (if any)
         if self.explore_start_time is not None:
-            metric = {"time/explorer_sync_interval": time.time() - self.explore_start_time}
+            metric = {"explore/time/sync_interval": time.time() - self.explore_start_time}
             self.explore_start_time = None
             if self.monitor is not None:
                 self.monitor.log(metric, step=end_step)
 
     async def _finish_explore_step(self, step: int, model_version: int) -> None:
         metric = {"rollout/model_version": model_version}
-        with Timer(metric, "time/wait_explore_step"):
+        with Timer(metric, "explore/time/wait_explore_step"):
             statuses, exps = await self.scheduler.get_results(
                 batch_id=step,
                 min_num=self.min_wait_num,
diff --git a/trinity/perf/__init__.py b/trinity/perf/__init__.py
index d638a9f35e1..2c0095122a2 100644
--- a/trinity/perf/__init__.py
+++ b/trinity/perf/__init__.py
@@ -1,17 +1,17 @@
 """Performance tooling package for Trinity."""
 
-from trinity.perf.explorer_metrics import (
-    TensorBoardScalarReader,
-    build_global_metrics,
-    collect_step_metrics,
-)
-from trinity.perf.explorer_perf import (
+from .report_utils import build_resource_timeline_payload
+from .resource_sampler import ResourceSampler
+from .stage_perf import (
     ExplorerPerfOptions,
     run_explorer_perf,
     write_explorer_perf_output,
 )
-from trinity.perf.report_utils import build_resource_timeline_payload
-from trinity.perf.resource_sampler import ResourceSampler
+from .tensorboard_metrics import (
+    TensorBoardScalarReader,
+    build_global_metrics,
+    collect_step_metrics,
+)
 
 __all__ = [
     "ExplorerPerfOptions",
@@ -22,4 +22,4 @@
     "collect_step_metrics",
     "run_explorer_perf",
     "write_explorer_perf_output",
-]
\ No newline at end of file
+]
diff --git a/trinity/perf/report_utils.py b/trinity/perf/report_utils.py
index bc253f80f70..7cc5bd2a066 100644
--- a/trinity/perf/report_utils.py
+++ b/trinity/perf/report_utils.py
@@ -56,4 +56,4 @@ def build_resource_timeline_payload(samples: list[ResourceSample]) -> dict:
                 for gpu_id, values in gpu_memory_series.items()
             },
         },
-    }
\ No newline at end of file
+    }
diff --git a/trinity/perf/resource_backends.py b/trinity/perf/resource_backends.py
index 913fc324347..83839343b39 100644
--- a/trinity/perf/resource_backends.py
+++ b/trinity/perf/resource_backends.py
@@ -116,4 +116,4 @@ def sample(self) -> ResourceSample:
             memory_rss_mb=float(memory_info.rss) / (1024 * 1024),
             memory_percent=float(self._process.memory_percent()),
             gpu_metrics=gpu_metrics,
-        )
\ No newline at end of file
+        )
diff --git a/trinity/perf/resource_sampler.py b/trinity/perf/resource_sampler.py
index 19b4073b144..c6ccd285c67 100644
--- a/trinity/perf/resource_sampler.py
+++ b/trinity/perf/resource_sampler.py
@@ -66,4 +66,4 @@ def _run(self) -> None:
     def _collect_once(self) -> None:
         sample = self.backend.sample()
         with self._lock:
-            self._samples.append(sample)
\ No newline at end of file
+            self._samples.append(sample)
diff --git a/trinity/perf/explorer_perf.py b/trinity/perf/stage_perf.py
similarity index 74%
rename from trinity/perf/explorer_perf.py
rename to trinity/perf/stage_perf.py
index 8aeab1f4b5a..edaf7c1de41 100644
--- a/trinity/perf/explorer_perf.py
+++ b/trinity/perf/stage_perf.py
@@ -10,18 +10,16 @@
 from typing import Any, Optional
 
 import ray
-from ray.exceptions import GetTimeoutError, RayTaskError
 
 from trinity.buffer.pipelines.task_pipeline import check_and_run_task_pipeline
 from trinity.common.config import Config, load_config
-from trinity.explorer.explorer import Explorer
-from trinity.perf.explorer_metrics import (
+from trinity.perf.report_utils import build_resource_timeline_payload
+from trinity.perf.resource_sampler import ResourceSampler
+from trinity.perf.tensorboard_metrics import (
     TensorBoardScalarReader,
     build_global_metrics,
     collect_step_metrics,
 )
-from trinity.perf.report_utils import build_resource_timeline_payload
-from trinity.perf.resource_sampler import ResourceSampler
 from trinity.utils.plugin_loader import load_plugins
 
 
@@ -102,24 +100,23 @@ def write_explorer_perf_output(output_path: str, payload: dict[str, Any]) -> Non
 
 def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
     """Run Explorer perf collection and return the result payload."""
+    from trinity.cli.launcher import explore
+
     load_plugins()
     config: Optional[Config] = None
-    explorer_actor = None
     sampler: Optional[ResourceSampler] = None
-    success = False
     error: Optional[str] = None
     startup_time_sec: Optional[float] = None
     run_time_sec: Optional[float] = None
     total_time_sec: Optional[float] = None
     resource_payload: dict[str, Any] = {"resource_timeline": [], "chart_series": {}}
     step_metrics: list[dict[str, Any]] = []
-    startup_started_at: Optional[float] = None
-    run_started_at: Optional[float] = None
 
     try:
         config = load_config(options.config_path)
         config.buffer.total_steps = options.total_steps
         config.monitor.monitor_type = "tensorboard"
+        config.continue_from_checkpoint = False  # ensure we start fresh for perf testing
         validate_explorer_perf_config(config)
         config.check_and_update()
 
@@ -134,36 +131,18 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
         sampler = ResourceSampler(interval_seconds=options.monitor_interval)
         sampler.start()
 
-        startup_started_at = time.perf_counter()
-        explorer_actor = Explorer.get_actor(config)
-        ray.get(explorer_actor.prepare.remote(), timeout=options.timeout)
-        startup_time_sec = time.perf_counter() - startup_started_at
-
-        run_started_at = time.perf_counter()
-        ray.get(explorer_actor.sync_weight.remote(), timeout=options.timeout)
-        ray.get(explorer_actor.explore.remote(), timeout=options.timeout)
-        run_time_sec = time.perf_counter() - run_started_at
-        total_time_sec = time.perf_counter() - startup_started_at
-        success = True
-    except (RuntimeError, ValueError, TimeoutError, GetTimeoutError, RayTaskError):
+        stage_status = explore(config, timeout=options.timeout)
+        startup_time_sec = stage_status.startup_time_sec
+        run_time_sec = stage_status.run_time_sec
+        total_time_sec = stage_status.total_time_sec
+        if stage_status.error is not None:
+            error = stage_status.error.traceback_text
+    except (RuntimeError, ValueError):
         error = traceback.format_exc()
-        if startup_started_at is not None and startup_time_sec is None and run_started_at is None:
-            startup_time_sec = time.perf_counter() - startup_started_at
-        if run_started_at is not None and run_time_sec is None:
-            run_time_sec = time.perf_counter() - run_started_at
-        if startup_started_at is not None and total_time_sec is None:
-            total_time_sec = time.perf_counter() - startup_started_at
     finally:
         collected_samples = sampler.stop() if sampler is not None else []
         resource_payload = build_resource_timeline_payload(collected_samples)
 
-        if explorer_actor is not None and ray.is_initialized():
-            try:
-                ray.get(explorer_actor.shutdown.remote(), timeout=options.timeout)
-            except (RuntimeError, TimeoutError, GetTimeoutError, RayTaskError):
-                if error is None:
-                    error = traceback.format_exc()
-
         if config is not None:
             tensorboard_dir = os.path.join(
                 config.monitor.cache_dir, "tensorboard", config.explorer.name
@@ -183,6 +162,6 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
         total_time_sec=total_time_sec,
         resource_payload=resource_payload,
         step_metrics=step_metrics,
-        success=success,
+        success=error is None,
         error=error,
-    )
\ No newline at end of file
+    )
diff --git a/trinity/perf/explorer_metrics.py b/trinity/perf/tensorboard_metrics.py
similarity index 98%
rename from trinity/perf/explorer_metrics.py
rename to trinity/perf/tensorboard_metrics.py
index 84149fe33fa..9c0d7b011f7 100644
--- a/trinity/perf/explorer_metrics.py
+++ b/trinity/perf/tensorboard_metrics.py
@@ -1,4 +1,4 @@
-"""Helpers for Explorer performance metric parsing and aggregation."""
+"""Helpers for TensorBoard metric parsing and aggregation."""
 
 from __future__ import annotations
 
@@ -111,4 +111,4 @@ def build_global_metrics(step_metrics: list[dict[str, Any]]) -> dict[str, Option
         "overall_throughput_task_per_min": overall_throughput,
         "overall_avg_task_time_sec": overall_avg_task_time,
         "total_step_time_sec": total_step_time_sec if total_step_time_sec > 0 else None,
-    }
\ No newline at end of file
+    }

From 60e6743f2621ad1a3da5fd3eea4fb103009f5707 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Tue, 28 Apr 2026 21:05:07 +0800
Subject: [PATCH 05/20] fix metrics

---
 perf/scripts/explorer/README.md     |  4 +-
 trinity/cli/launcher.py             |  2 +-
 trinity/perf/resource_backends.py   | 61 ++++++++++++++++++++++++-----
 trinity/perf/stage_perf.py          | 29 ++++++++------
 trinity/perf/tensorboard_metrics.py | 48 +++++------------------
 5 files changed, 80 insertions(+), 64 deletions(-)

diff --git a/perf/scripts/explorer/README.md b/perf/scripts/explorer/README.md
index 27556ffd09a..c7beddbd30d 100644
--- a/perf/scripts/explorer/README.md
+++ b/perf/scripts/explorer/README.md
@@ -186,7 +186,7 @@ Explorer 运行指标优先从 TensorBoard 事件文件提取，原因如下：
 记录关键耗时：
 
 1. `startup_time_sec`
-2. `run_time_sec`
+2. `execution_time_sec`
 3. `total_time_sec`
 
 ### `resource_timeline`
@@ -294,7 +294,7 @@ python -m trinity.cli.launcher perf --module explorer --config <path_to_config>
   },
   "timing": {
     "startup_time_sec": 32.5,
-    "run_time_sec": 640.2,
+    "execution_time_sec": 640.2,
     "total_time_sec": 672.7
   },
   "resource_timeline": [
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 334ea71d9ed..d0f2e4ed091 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -479,7 +479,7 @@ def perf(
         }
         write_explorer_perf_output(output_path, payload)
     if not payload["status"]["success"]:
-        typer.echo(f"Failed to run perf: {payload['status']['error']}")
+        typer.echo(f"Failed to run perf: {payload['status']['message']}")
 
 
 @app.command()
diff --git a/trinity/perf/resource_backends.py b/trinity/perf/resource_backends.py
index 83839343b39..817a2fd6f86 100644
--- a/trinity/perf/resource_backends.py
+++ b/trinity/perf/resource_backends.py
@@ -4,6 +4,7 @@
 
 import time
 from dataclasses import asdict, dataclass
+from typing import Iterable
 
 import psutil
 from pynvml import (
@@ -53,10 +54,20 @@ def to_dict(self) -> dict:
 class SystemResourceBackend:
     """Collect system-level CPU, memory and per-GPU metrics."""
 
-    def __init__(self) -> None:
+    def __init__(
+        self,
+        gpu_subsample_count: int = 5,
+        gpu_subsample_interval_seconds: float = 0.2,
+    ) -> None:
+        if gpu_subsample_count <= 0:
+            raise ValueError("gpu_subsample_count must be greater than 0.")
+        if gpu_subsample_interval_seconds < 0:
+            raise ValueError("gpu_subsample_interval_seconds must be non-negative.")
         self._process = psutil.Process()
         self._initialized = False
         self._gpu_count = 0
+        self._gpu_subsample_count = gpu_subsample_count
+        self._gpu_subsample_interval_seconds = gpu_subsample_interval_seconds
 
     def open(self) -> None:
         """Initialize the GPU management library and validate the environment."""
@@ -92,6 +103,22 @@ def sample(self) -> ResourceSample:
 
         timestamp = time.time()
         memory_info = self._process.memory_info()
+        gpu_metrics = self._collect_gpu_metrics()
+        for _ in range(1, self._gpu_subsample_count):
+            if self._gpu_subsample_interval_seconds > 0:
+                time.sleep(self._gpu_subsample_interval_seconds)
+            gpu_metrics = self._merge_gpu_metrics(gpu_metrics, self._collect_gpu_metrics())
+
+        return ResourceSample(
+            timestamp=timestamp,
+            cpu_percent=float(self._process.cpu_percent(interval=None)),
+            memory_rss_mb=float(memory_info.rss) / (1024 * 1024),
+            memory_percent=float(self._process.memory_percent()),
+            gpu_metrics=gpu_metrics,
+        )
+
+    def _collect_gpu_metrics(self) -> list[GPUSample]:
+        """Collect one instantaneous GPU snapshot from NVML."""
         gpu_metrics: list[GPUSample] = []
         for gpu_index in range(self._gpu_count):
             gpu_handle = nvmlDeviceGetHandleByIndex(gpu_index)
@@ -109,11 +136,27 @@ def sample(self) -> ResourceSample:
                     gpu_memory_total_mb=float(gpu_memory.total) / (1024 * 1024),
                 )
             )
-
-        return ResourceSample(
-            timestamp=timestamp,
-            cpu_percent=float(self._process.cpu_percent(interval=None)),
-            memory_rss_mb=float(memory_info.rss) / (1024 * 1024),
-            memory_percent=float(self._process.memory_percent()),
-            gpu_metrics=gpu_metrics,
-        )
+        return gpu_metrics
+
+    def _merge_gpu_metrics(
+        self,
+        base_metrics: Iterable[GPUSample],
+        next_metrics: Iterable[GPUSample],
+    ) -> list[GPUSample]:
+        """Merge GPU snapshots by keeping peak utilization and memory usage."""
+        merged_metrics = {gpu_metric.gpu_id: gpu_metric for gpu_metric in base_metrics}
+        for gpu_metric in next_metrics:
+            prior_metric = merged_metrics.get(gpu_metric.gpu_id)
+            if prior_metric is None:
+                merged_metrics[gpu_metric.gpu_id] = gpu_metric
+                continue
+            merged_metrics[gpu_metric.gpu_id] = GPUSample(
+                gpu_id=gpu_metric.gpu_id,
+                name=gpu_metric.name,
+                gpu_util_percent=max(prior_metric.gpu_util_percent, gpu_metric.gpu_util_percent),
+                gpu_memory_used_mb=max(
+                    prior_metric.gpu_memory_used_mb, gpu_metric.gpu_memory_used_mb
+                ),
+                gpu_memory_total_mb=gpu_metric.gpu_memory_total_mb,
+            )
+        return [merged_metrics[gpu_index] for gpu_index in sorted(merged_metrics)]
diff --git a/trinity/perf/stage_perf.py b/trinity/perf/stage_perf.py
index edaf7c1de41..74f6a0f8a93 100644
--- a/trinity/perf/stage_perf.py
+++ b/trinity/perf/stage_perf.py
@@ -40,11 +40,11 @@ def validate_explorer_perf_config(config: Config) -> None:
 
 def build_explorer_perf_payload(
     *,
-    config: Optional[Config],
+    config: Config,
     options: ExplorerPerfOptions,
-    startup_time_sec: Optional[float],
-    run_time_sec: Optional[float],
-    total_time_sec: Optional[float],
+    startup_time_sec: float,
+    execution_time_sec: float,
+    total_time_sec: float,
     resource_payload: dict[str, Any],
     step_metrics: list[dict[str, Any]],
     success: bool,
@@ -66,6 +66,7 @@ def build_explorer_perf_payload(
 
     return {
         "run_meta": {
+            "module": "explorer",
             "config_path": options.config_path,
             "explorer_name": explorer_name,
             "monitor_interval_sec": options.monitor_interval,
@@ -75,12 +76,12 @@ def build_explorer_perf_payload(
         },
         "timing": {
             "startup_time_sec": startup_time_sec,
-            "run_time_sec": run_time_sec,
+            "execution_time_sec": execution_time_sec,
             "total_time_sec": total_time_sec,
         },
         **resource_payload,
         "step_metrics": step_metrics,
-        "global_metrics": build_global_metrics(step_metrics),
+        "global_metrics": build_global_metrics(step_metrics, execution_time_sec=execution_time_sec),
         "artifacts": artifacts,
         "status": {
             "success": success,
@@ -103,12 +104,12 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
     from trinity.cli.launcher import explore
 
     load_plugins()
-    config: Optional[Config] = None
+    config: Config = None
     sampler: Optional[ResourceSampler] = None
     error: Optional[str] = None
-    startup_time_sec: Optional[float] = None
-    run_time_sec: Optional[float] = None
-    total_time_sec: Optional[float] = None
+    startup_time_sec: float = None
+    execution_time_sec: float = None
+    total_time_sec: float = None
     resource_payload: dict[str, Any] = {"resource_timeline": [], "chart_series": {}}
     step_metrics: list[dict[str, Any]] = []
 
@@ -133,12 +134,14 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
 
         stage_status = explore(config, timeout=options.timeout)
         startup_time_sec = stage_status.startup_time_sec
-        run_time_sec = stage_status.run_time_sec
+        execution_time_sec = stage_status.execution_time_sec
         total_time_sec = stage_status.total_time_sec
         if stage_status.error is not None:
             error = stage_status.error.traceback_text
-    except (RuntimeError, ValueError):
+    except (RuntimeError, ValueError) as e:
         error = traceback.format_exc()
+        print(f"Explorer perf failed with error: {e}\n{error}")
+        raise e
     finally:
         collected_samples = sampler.stop() if sampler is not None else []
         resource_payload = build_resource_timeline_payload(collected_samples)
@@ -158,7 +161,7 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
         config=config,
         options=options,
         startup_time_sec=startup_time_sec,
-        run_time_sec=run_time_sec,
+        execution_time_sec=execution_time_sec,
         total_time_sec=total_time_sec,
         resource_payload=resource_payload,
         step_metrics=step_metrics,
diff --git a/trinity/perf/tensorboard_metrics.py b/trinity/perf/tensorboard_metrics.py
index 9c0d7b011f7..a3926290346 100644
--- a/trinity/perf/tensorboard_metrics.py
+++ b/trinity/perf/tensorboard_metrics.py
@@ -8,10 +8,8 @@
 
 from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
 
-STEP_TIME_METRIC_CANDIDATES = (
-    "time/wait_explore_step",
-    "time/explorer_sync_interval",
-)
+TASK_EXECUTION_METRIC_NAME = "rollout/time/task_execution/mean"
+RUN_EXECUTION_METRIC_NAME = "rollout/time/run_execution/mean"
 FINISHED_TASK_METRIC_NAME = "rollout/finished_task_count"
 
 
@@ -43,14 +41,6 @@ def _find_event_files(self, log_dir: str) -> list[str]:
         return sorted(event_files)
 
 
-def get_step_time(metric_map: dict[str, dict[int, float]], step: int) -> Optional[float]:
-    """Select the best available step duration metric for one step."""
-    for metric_name in STEP_TIME_METRIC_CANDIDATES:
-        if step in metric_map.get(metric_name, {}):
-            return float(metric_map[metric_name][step])
-    return None
-
-
 def extract_raw_metrics_for_step(
     metric_map: dict[str, dict[int, float]], step: int
 ) -> dict[str, float]:
@@ -68,47 +58,27 @@ def collect_step_metrics(metric_map: dict[str, dict[int, float]]) -> list[dict[s
     step_metrics: list[dict[str, Any]] = []
     for step in step_numbers:
         finished_task_count = float(metric_map[FINISHED_TASK_METRIC_NAME][step])
-        step_time_sec = get_step_time(metric_map, step)
+        time_per_task = float(metric_map[TASK_EXECUTION_METRIC_NAME][step])
+        time_per_run = float(metric_map[RUN_EXECUTION_METRIC_NAME][step])
         step_metrics.append(
             {
                 "step": step,
                 "finished_task_count": finished_task_count,
-                "step_time_sec": step_time_sec,
-                "throughput_task_per_min": (
-                    finished_task_count / step_time_sec * 60.0
-                    if step_time_sec is not None and step_time_sec > 0 and finished_task_count > 0
-                    else None
-                ),
-                "avg_task_time_sec": (
-                    step_time_sec / finished_task_count
-                    if step_time_sec is not None and step_time_sec > 0 and finished_task_count > 0
-                    else None
-                ),
+                "time_per_task": time_per_task,
+                "time_per_run": time_per_run,
                 "raw_metrics": extract_raw_metrics_for_step(metric_map, step),
             }
         )
     return step_metrics
 
 
-def build_global_metrics(step_metrics: list[dict[str, Any]]) -> dict[str, Optional[float]]:
+def build_global_metrics(
+    step_metrics: list[dict[str, Any]], execution_time_sec: float
+) -> dict[str, Optional[float]]:
     """Aggregate global metrics from per-step records."""
     total_finished_task_count = float(
         sum(step_metric["finished_task_count"] for step_metric in step_metrics)
     )
-    total_step_time_sec = sum(
-        step_metric["step_time_sec"]
-        for step_metric in step_metrics
-        if step_metric["step_time_sec"] is not None
-    )
-    if total_finished_task_count > 0 and total_step_time_sec > 0:
-        overall_throughput = total_finished_task_count / total_step_time_sec * 60.0
-        overall_avg_task_time = total_step_time_sec / total_finished_task_count
-    else:
-        overall_throughput = None
-        overall_avg_task_time = None
     return {
         "total_finished_task_count": total_finished_task_count,
-        "overall_throughput_task_per_min": overall_throughput,
-        "overall_avg_task_time_sec": overall_avg_task_time,
-        "total_step_time_sec": total_step_time_sec if total_step_time_sec > 0 else None,
     }

From 7f654906a0cd3d0f98c73cf0c0016db1cb0dee57 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Tue, 28 Apr 2026 21:22:30 +0800
Subject: [PATCH 06/20] add viewer

---
 trinity/cli/launcher.py             | 42 ++++++++++++++++-------------
 trinity/perf/__init__.py            |  2 --
 trinity/perf/stage_perf.py          |  2 --
 trinity/perf/tensorboard_metrics.py | 27 +++----------------
 4 files changed, 27 insertions(+), 46 deletions(-)

diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index d0f2e4ed091..9c2bcd132ca 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -46,7 +46,7 @@ class StageStatus:
     stage: str
     success: bool
     startup_time_sec: Optional[float] = None
-    run_time_sec: Optional[float] = None
+    execution_time_sec: Optional[float] = None
     total_time_sec: Optional[float] = None
     error: Optional[StageError] = None
 
@@ -73,13 +73,13 @@ def bench(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
 
         run_started_at = time.perf_counter()
         ray.get(explorer.benchmark.remote(), timeout=timeout)
-        run_time_sec = time.perf_counter() - run_started_at
+        execution_time_sec = time.perf_counter() - run_started_at
         logger.info("Benchmark finished.")
         return StageStatus(
             stage="bench",
             success=True,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
         )
     except Exception as exc:
@@ -89,7 +89,7 @@ def bench(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
             stage="bench",
             success=False,
             startup_time_sec=startup_time_sec,
-            run_time_sec=None,
+            execution_time_sec=None,
             total_time_sec=time.perf_counter() - startup_started_at,
             error=error,
         )
@@ -113,23 +113,25 @@ def explore(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
         run_started_at = time.perf_counter()
         ray.get(explorer.sync_weight.remote(), timeout=timeout)
         ray.get(explorer.explore.remote(), timeout=timeout)
-        run_time_sec = time.perf_counter() - run_started_at
+        execution_time_sec = time.perf_counter() - run_started_at
         return StageStatus(
             stage="explore",
             success=True,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
         )
     except Exception as exc:
         error = _build_stage_error(exc)
         logger.error(f"Explorer failed:\n{error.traceback_text}")
-        run_time_sec = time.perf_counter() - run_started_at if run_started_at is not None else None
+        execution_time_sec = (
+            time.perf_counter() - run_started_at if run_started_at is not None else None
+        )
         return StageStatus(
             stage="explore",
             success=False,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
             error=error,
         )
@@ -153,23 +155,25 @@ def train(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
         run_started_at = time.perf_counter()
         ray.get(trainer.sync_weight.remote(), timeout=timeout)
         ray.get(trainer.train.remote(), timeout=timeout)
-        run_time_sec = time.perf_counter() - run_started_at
+        execution_time_sec = time.perf_counter() - run_started_at
         return StageStatus(
             stage="train",
             success=True,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
         )
     except Exception as exc:
         error = _build_stage_error(exc)
         logger.error(f"Trainer failed:\n{error.traceback_text}")
-        run_time_sec = time.perf_counter() - run_started_at if run_started_at is not None else None
+        execution_time_sec = (
+            time.perf_counter() - run_started_at if run_started_at is not None else None
+        )
         return StageStatus(
             stage="train",
             success=False,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
             error=error,
         )
@@ -193,23 +197,25 @@ def serve(config: Config, *, timeout: Optional[float] = None) -> StageStatus:
         run_started_at = time.perf_counter()
         ray.get(explorer.sync_weight.remote(), timeout=timeout)
         ray.get(explorer.serve.remote(), timeout=timeout)
-        run_time_sec = time.perf_counter() - run_started_at
+        execution_time_sec = time.perf_counter() - run_started_at
         return StageStatus(
             stage="serve",
             success=True,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
         )
     except Exception as exc:
         error = _build_stage_error(exc)
         logger.error(f"Explorer failed:\n{error.traceback_text}")
-        run_time_sec = time.perf_counter() - run_started_at if run_started_at is not None else None
+        execution_time_sec = (
+            time.perf_counter() - run_started_at if run_started_at is not None else None
+        )
         return StageStatus(
             stage="serve",
             success=False,
             startup_time_sec=startup_time_sec,
-            run_time_sec=run_time_sec,
+            execution_time_sec=execution_time_sec,
             total_time_sec=time.perf_counter() - startup_started_at,
             error=error,
         )
@@ -436,7 +442,7 @@ def perf(
     monitor_interval: Annotated[
         float,
         typer.Option("--monitor-interval", help="Resource sampling interval in seconds."),
-    ] = 5.0,
+    ] = 2.0,
     total_steps: Annotated[
         int,
         typer.Option("--total-steps", help="Total steps to run the explorer for."),
@@ -479,7 +485,7 @@ def perf(
         }
         write_explorer_perf_output(output_path, payload)
     if not payload["status"]["success"]:
-        typer.echo(f"Failed to run perf: {payload['status']['message']}")
+        typer.echo(f"Failed to run perf: {payload['status']['error']}")
 
 
 @app.command()
diff --git a/trinity/perf/__init__.py b/trinity/perf/__init__.py
index 2c0095122a2..11505837685 100644
--- a/trinity/perf/__init__.py
+++ b/trinity/perf/__init__.py
@@ -9,7 +9,6 @@
 )
 from .tensorboard_metrics import (
     TensorBoardScalarReader,
-    build_global_metrics,
     collect_step_metrics,
 )
 
@@ -17,7 +16,6 @@
     "ExplorerPerfOptions",
     "ResourceSampler",
     "TensorBoardScalarReader",
-    "build_global_metrics",
     "build_resource_timeline_payload",
     "collect_step_metrics",
     "run_explorer_perf",
diff --git a/trinity/perf/stage_perf.py b/trinity/perf/stage_perf.py
index 74f6a0f8a93..f4b3382ba41 100644
--- a/trinity/perf/stage_perf.py
+++ b/trinity/perf/stage_perf.py
@@ -17,7 +17,6 @@
 from trinity.perf.resource_sampler import ResourceSampler
 from trinity.perf.tensorboard_metrics import (
     TensorBoardScalarReader,
-    build_global_metrics,
     collect_step_metrics,
 )
 from trinity.utils.plugin_loader import load_plugins
@@ -81,7 +80,6 @@ def build_explorer_perf_payload(
         },
         **resource_payload,
         "step_metrics": step_metrics,
-        "global_metrics": build_global_metrics(step_metrics, execution_time_sec=execution_time_sec),
         "artifacts": artifacts,
         "status": {
             "success": success,
diff --git a/trinity/perf/tensorboard_metrics.py b/trinity/perf/tensorboard_metrics.py
index a3926290346..28824f256ce 100644
--- a/trinity/perf/tensorboard_metrics.py
+++ b/trinity/perf/tensorboard_metrics.py
@@ -57,28 +57,7 @@ def collect_step_metrics(metric_map: dict[str, dict[int, float]]) -> list[dict[s
     step_numbers = sorted(metric_map.get(FINISHED_TASK_METRIC_NAME, {}).keys())
     step_metrics: list[dict[str, Any]] = []
     for step in step_numbers:
-        finished_task_count = float(metric_map[FINISHED_TASK_METRIC_NAME][step])
-        time_per_task = float(metric_map[TASK_EXECUTION_METRIC_NAME][step])
-        time_per_run = float(metric_map[RUN_EXECUTION_METRIC_NAME][step])
-        step_metrics.append(
-            {
-                "step": step,
-                "finished_task_count": finished_task_count,
-                "time_per_task": time_per_task,
-                "time_per_run": time_per_run,
-                "raw_metrics": extract_raw_metrics_for_step(metric_map, step),
-            }
-        )
+        metrics = extract_raw_metrics_for_step(metric_map, step)
+        metrics["step"] = step
+        step_metrics.append(metrics)
     return step_metrics
-
-
-def build_global_metrics(
-    step_metrics: list[dict[str, Any]], execution_time_sec: float
-) -> dict[str, Optional[float]]:
-    """Aggregate global metrics from per-step records."""
-    total_finished_task_count = float(
-        sum(step_metric["finished_task_count"] for step_metric in step_metrics)
-    )
-    return {
-        "total_finished_task_count": total_finished_task_count,
-    }

From 027aad9ee04f99363c0e98718fcbb980db4686c7 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Tue, 28 Apr 2026 21:23:32 +0800
Subject: [PATCH 07/20] fix pre-commit

---
 trinity/perf/__init__.py            | 5 +----
 trinity/perf/tensorboard_metrics.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/trinity/perf/__init__.py b/trinity/perf/__init__.py
index 11505837685..1af3fa790c7 100644
--- a/trinity/perf/__init__.py
+++ b/trinity/perf/__init__.py
@@ -7,10 +7,7 @@
     run_explorer_perf,
     write_explorer_perf_output,
 )
-from .tensorboard_metrics import (
-    TensorBoardScalarReader,
-    collect_step_metrics,
-)
+from .tensorboard_metrics import TensorBoardScalarReader, collect_step_metrics
 
 __all__ = [
     "ExplorerPerfOptions",
diff --git a/trinity/perf/tensorboard_metrics.py b/trinity/perf/tensorboard_metrics.py
index 28824f256ce..2fb69d3bd99 100644
--- a/trinity/perf/tensorboard_metrics.py
+++ b/trinity/perf/tensorboard_metrics.py
@@ -4,7 +4,7 @@
 
 import os
 from collections import defaultdict
-from typing import Any, Optional
+from typing import Any
 
 from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
 

From e6d9817b0c0c85d40bb7b614688413aff1c2c2cd Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 10:07:33 +0800
Subject: [PATCH 08/20] add viewer

---
 trinity/perf/report_viewer.py | 350 ++++++++++++++++++++++++++++++++++
 1 file changed, 350 insertions(+)
 create mode 100644 trinity/perf/report_viewer.py

diff --git a/trinity/perf/report_viewer.py b/trinity/perf/report_viewer.py
new file mode 100644
index 00000000000..182f1da2440
--- /dev/null
+++ b/trinity/perf/report_viewer.py
@@ -0,0 +1,350 @@
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+
+import matplotlib.pyplot as plt
+import streamlit as st
+
+try:
+    from streamlit.runtime.scriptrunner import get_script_run_ctx
+except ImportError:  # pragma: no cover - fallback for streamlit runtime layout changes
+    from streamlit.runtime.scriptrunner_utils.script_run_context import (
+        get_script_run_ctx,
+    )
+
+STEP_METRIC_PREFIXES_BY_MODULE: dict[str, list[str]] = {
+    "explorer": [
+        "rollout/time/run_execution/mean",
+        "rollout/time/task_execution/mean",
+        "rollout/prompt_length/mean",
+        "rollout/response_length/mean",
+        "experience_pipeline/experience_count",
+    ],
+    "trainer": [],
+}
+
+MEMORY_SERIES_KEY = "memory_rss_mb"
+
+
+class PerfReportViewer:
+    @staticmethod
+    def run_viewer(report_path: str, port: int) -> None:
+        """Start the Streamlit perf report viewer."""
+        from streamlit.web import cli
+
+        viewer_path = Path(__file__)
+        sys.argv = [
+            "streamlit",
+            "run",
+            str(viewer_path.resolve()),
+            "--server.port",
+            str(port),
+            "--server.fileWatcherType",
+            "none",
+            "--",
+            "--report",
+            report_path,
+        ]
+        sys.exit(cli.main())
+
+
+def has_streamlit_context() -> bool:
+    return get_script_run_ctx() is not None
+
+
+if has_streamlit_context():
+    st.set_page_config(page_title="Trinity Performance Report", layout="wide")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Trinity Performance Report Viewer")
+    parser.add_argument("--report", type=str, required=True, help="Path to the perf report JSON.")
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8503,
+        help="Port used when auto-launching the Streamlit report viewer.",
+    )
+    return parser.parse_args()
+
+
+def load_report(report_path: str) -> dict[str, Any]:
+    report_file = Path(report_path)
+    if not report_file.exists():
+        raise FileNotFoundError(f"Report file not found: {report_path}")
+    with report_file.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def infer_module_name(report: dict[str, Any]) -> str:
+    run_meta = report.get("run_meta", {})
+    return str(run_meta.get("module"))
+
+
+def get_step_metric_prefixes(report: dict[str, Any]) -> list[str]:
+    module_name = infer_module_name(report)
+    return STEP_METRIC_PREFIXES_BY_MODULE.get(module_name, [])
+
+
+def format_timestamp(timestamp: Optional[float]) -> str:
+    if timestamp is None:
+        return "N/A"
+    return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")
+
+
+def format_metric_value(value: Any) -> str:
+    if value is None:
+        return "N/A"
+    if isinstance(value, float):
+        return f"{value:.4f}"
+    return str(value)
+
+
+def metric_label(metric_name: str) -> str:
+    return metric_name.replace("_", " ").title()
+
+
+def gpu_series_label(gpu_payload: dict[str, Any]) -> str:
+    gpu_id = gpu_payload.get("gpu_id", "?")
+    gpu_name = gpu_payload.get("name")
+    if gpu_name:
+        return f"GPU {gpu_id} ({gpu_name})"
+    return f"GPU {gpu_id}"
+
+
+def render_metric_card(metric_name: str, value: Any) -> None:
+    display_value = format_metric_value(value)
+    label = metric_label(metric_name)
+    st.markdown(
+        f"""
+        <div style="
+            background: linear-gradient(135deg, #eef6ff 0%, #f7fbff 100%);
+            border: 1px solid #d7e6fb;
+            border-radius: 14px;
+            padding: 16px 18px;
+            min-height: 108px;
+            box-shadow: 0 1px 3px rgba(15, 23, 42, 0.08);
+        ">
+            <div style="font-size: 0.9rem; color: #4a5a70; margin-bottom: 10px;">{label}</div>
+            <div style="font-size: 1.6rem; font-weight: 700; color: #0f172a;">{display_value}</div>
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+
+
+def build_elapsed_series(series: list[dict[str, Any]]) -> tuple[list[float], list[float]]:
+    if not series:
+        return [], []
+    start_timestamp = float(series[0]["timestamp"])
+    x_values = [float(point["timestamp"]) - start_timestamp for point in series]
+    y_values = [float(point["value"]) for point in series]
+    return x_values, y_values
+
+
+def render_line_chart(
+    title: str,
+    x_values: list[float],
+    y_series: dict[str, list[float]],
+    y_label: str,
+    legend_below: bool = False,
+    legend_columns: int = 1,
+) -> None:
+    st.markdown(f"#### {title}")
+    if not x_values or not y_series:
+        st.info(f"No data for {title}.")
+        return
+
+    figure, axis = plt.subplots(figsize=(6, 2.6))
+    for series_name, y_values in y_series.items():
+        axis.plot(x_values[: len(y_values)], y_values, label=series_name)
+    axis.set_xlabel("Elapsed Time (s)")
+    axis.set_ylabel(y_label)
+    axis.grid(True, alpha=0.3)
+    if len(y_series) > 1:
+        if legend_below:
+            axis.legend(
+                loc="upper center",
+                bbox_to_anchor=(0.5, -0.28),
+                ncol=min(legend_columns, len(y_series)),
+                frameon=False,
+                fontsize=8,
+            )
+            figure.subplots_adjust(bottom=0.32)
+        else:
+            axis.legend()
+    st.pyplot(figure, clear_figure=True)
+
+
+def render_step_metric_chart(step_metrics: list[dict[str, Any]], metric_key: str) -> None:
+    x_values = [
+        int(step_metric["step"]) for step_metric in step_metrics if metric_key in step_metric
+    ]
+    y_values = [
+        float(step_metric[metric_key])
+        for step_metric in step_metrics
+        if step_metric.get(metric_key) is not None
+    ]
+
+    st.markdown(f"#### {metric_label(metric_key)}")
+    if not x_values or not y_values:
+        st.info(f"No data for {metric_key}.")
+        return
+
+    figure, axis = plt.subplots(figsize=(6, 2.6))
+    axis.plot(x_values[: len(y_values)], y_values, marker="o")
+    axis.set_xlabel("Step")
+    axis.set_ylabel(metric_label(metric_key))
+    axis.grid(True, alpha=0.3)
+    st.pyplot(figure, clear_figure=True)
+
+
+def render_header(report: dict[str, Any], report_path: str) -> None:
+    run_meta = report.get("run_meta", {})
+    status = report.get("status", {})
+
+    st.title("Trinity Performance Report")
+    st.caption(f"Report: {report_path}")
+    st.caption(f"Generated At: {format_timestamp(run_meta.get('generated_at'))}")
+
+    if not status.get("success"):
+        st.error("Run failed.")
+        if status.get("error"):
+            with st.expander("Error Traceback"):
+                st.code(str(status["error"]))
+
+
+def render_global_metrics(report: dict[str, Any]) -> None:
+    st.header("Global Metrics")
+    timing = report.get("timing", {})
+
+    metric_items: list[tuple[str, Any]] = []
+    metric_items.extend(
+        (
+            metric_key,
+            timing.get(metric_key),
+        )
+        for metric_key in ("startup_time_sec", "execution_time_sec", "total_time_sec")
+    )
+
+    shown_items = [(key, value) for key, value in metric_items if value is not None]
+    if not shown_items:
+        st.info("No global metrics found in this report.")
+        return
+
+    columns = st.columns(min(4, len(shown_items)))
+    for index, (metric_key, value) in enumerate(shown_items):
+        with columns[index % len(columns)]:
+            render_metric_card(metric_key, value)
+
+
+def render_step_metrics(report: dict[str, Any]) -> None:
+    st.header("Step Metrics")
+    step_metrics = report.get("step_metrics", [])
+    if not step_metrics:
+        st.info("No step metrics found in this report.")
+        return
+
+    metric_prefixes = get_step_metric_prefixes(report)
+    metric_keys: list[str] = []
+    for step_metric in step_metrics:
+        for metric_key, metric_value in step_metric.items():
+            if metric_key in {"step", "raw_metrics"} or metric_value is None:
+                continue
+            if any(metric_key.startswith(prefix) for prefix in metric_prefixes):
+                if metric_key not in metric_keys:
+                    metric_keys.append(metric_key)
+
+    if not metric_keys:
+        st.info("No configured step metrics matched the current report.")
+        return
+
+    for metric_index in range(0, len(metric_keys), 2):
+        columns = st.columns(2)
+        for column_index, metric_key in enumerate(metric_keys[metric_index : metric_index + 2]):
+            with columns[column_index]:
+                render_step_metric_chart(step_metrics, metric_key)
+
+    with st.expander("Step Metrics Table"):
+        compact_rows = []
+        for step_metric in step_metrics:
+            compact_row = {key: value for key, value in step_metric.items() if key != "raw_metrics"}
+            compact_rows.append(compact_row)
+        st.dataframe(compact_rows, use_container_width=True)
+
+
+def render_resource_utilization(report: dict[str, Any]) -> None:
+    st.header("Resource Utilization")
+    chart_series = report.get("chart_series", {})
+
+    cpu_series = chart_series.get("cpu_percent", [])
+    cpu_x, cpu_y = build_elapsed_series(cpu_series)
+
+    memory_series = chart_series.get(MEMORY_SERIES_KEY, [])
+    memory_x, memory_y = build_elapsed_series(memory_series)
+
+    gpu_util_series = chart_series.get("gpu_util_percent", {})
+    gpu_util_x: list[float] = []
+    gpu_util_y: dict[str, list[float]] = {}
+    for gpu_payload in gpu_util_series.values():
+        gpu_util_x, values = build_elapsed_series(gpu_payload.get("values", []))
+        gpu_util_y[gpu_series_label(gpu_payload)] = values
+    gpu_memory_series = chart_series.get("gpu_memory_used_mb", {})
+    gpu_memory_x: list[float] = []
+    gpu_memory_y: dict[str, list[float]] = {}
+    for gpu_payload in gpu_memory_series.values():
+        gpu_memory_x, values = build_elapsed_series(gpu_payload.get("values", []))
+        gpu_memory_y[gpu_series_label(gpu_payload)] = values
+    first_row = st.columns(2)
+    with first_row[0]:
+        render_line_chart("CPU Utilization", cpu_x, {"CPU": cpu_y}, "CPU %")
+    with first_row[1]:
+        render_line_chart("Memory Usage", memory_x, {"Memory": memory_y}, "MB")
+
+    second_row = st.columns(2)
+    with second_row[0]:
+        render_line_chart(
+            "GPU Utilization",
+            gpu_util_x,
+            gpu_util_y,
+            "GPU %",
+            legend_below=True,
+            legend_columns=2,
+        )
+    with second_row[1]:
+        render_line_chart(
+            "GPU Memory Usage",
+            gpu_memory_x,
+            gpu_memory_y,
+            "MB",
+            legend_below=True,
+            legend_columns=2,
+        )
+
+
+def main(args: Optional[argparse.Namespace] = None) -> None:
+    if args is None:
+        args = parse_args()
+
+    try:
+        report = load_report(args.report)
+    except (FileNotFoundError, json.JSONDecodeError, OSError) as error:
+        st.title("Trinity Perf Report Viewer")
+        st.error(str(error))
+        return
+
+    render_header(report, args.report)
+    render_global_metrics(report)
+    render_step_metrics(report)
+    render_resource_utilization(report)
+
+
+if __name__ == "__main__":
+    parsed_args = parse_args()
+    if has_streamlit_context():
+        main(parsed_args)
+    else:
+        PerfReportViewer.run_viewer(parsed_args.report, parsed_args.port)

From 6a8bf1697f1716a346ef66b1c85180c6ac4ccf4b Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 10:15:35 +0800
Subject: [PATCH 09/20] add  and

---
 trinity/cli/launcher.py       | 72 +--------------------------
 trinity/cli/perf.py           | 94 +++++++++++++++++++++++++++++++++++
 trinity/perf/report_viewer.py | 13 +++--
 3 files changed, 106 insertions(+), 73 deletions(-)
 create mode 100644 trinity/cli/perf.py

diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 9c2bcd132ca..3b8238f1520 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -12,15 +12,11 @@
 import typer
 from typing_extensions import Annotated
 
+from trinity.cli.perf import perf_app
 from trinity.common.config import Config, load_config
 from trinity.common.constants import DEBUG_NAMESPACE, PLUGIN_DIRS_ENV_VAR
 from trinity.manager.checkpoint_converter import Converter
 from trinity.manager.state_manager import StateManager
-from trinity.perf import (
-    ExplorerPerfOptions,
-    run_explorer_perf,
-    write_explorer_perf_output,
-)
 from trinity.utils.dlc_utils import is_running, setup_ray_cluster, stop_ray_cluster
 from trinity.utils.log import get_logger
 from trinity.utils.plugin_loader import load_plugins
@@ -32,6 +28,7 @@
     pretty_exceptions_show_locals=False,
     pretty_exceptions_short=True,
 )
+app.add_typer(perf_app, name="perf")
 
 
 @dataclass(slots=True)
@@ -423,71 +420,6 @@ def studio(
     ConfigManager.run(port)
 
 
-@app.command()
-def perf(
-    config: Annotated[
-        str,
-        typer.Option("--config", "-c", help="Path to the config file."),
-    ],
-    module: Annotated[
-        str,
-        typer.Option(
-            "--module", "-m", help="Perf module to run. Currently only supports 'explorer'."
-        ),
-    ] = "explorer",
-    output_path: Annotated[
-        str,
-        typer.Option("--output-path", "-o", help="Path to the output JSON file."),
-    ] = "./perf/output.json",
-    monitor_interval: Annotated[
-        float,
-        typer.Option("--monitor-interval", help="Resource sampling interval in seconds."),
-    ] = 2.0,
-    total_steps: Annotated[
-        int,
-        typer.Option("--total-steps", help="Total steps to run the explorer for."),
-    ] = 5,
-    timeout: Annotated[
-        Optional[float],
-        typer.Option(
-            "--timeout", help="Optional timeout in seconds for prepare, sync and explore calls."
-        ),
-    ] = None,
-    plugin_dir: Annotated[
-        Optional[str],
-        typer.Option("--plugin-dir", help="Path to the directory containing plugin modules."),
-    ] = None,
-) -> None:
-    """Run performance benchmark."""
-    if module != "explorer":
-        raise typer.BadParameter("Only --module explorer is supported for now.")
-
-    try:
-        if plugin_dir:
-            os.environ[PLUGIN_DIRS_ENV_VAR] = plugin_dir
-
-        options = ExplorerPerfOptions(
-            config_path=config,
-            output_path=output_path,
-            monitor_interval=monitor_interval,
-            total_steps=total_steps,
-            timeout=timeout,
-        )
-        payload = run_explorer_perf(options)
-        write_explorer_perf_output(output_path, payload)
-    except Exception:
-        payload = {
-            "status": {
-                "success": False,
-                "error": traceback.format_exc(),
-            },
-            "data": None,
-        }
-        write_explorer_perf_output(output_path, payload)
-    if not payload["status"]["success"]:
-        typer.echo(f"Failed to run perf: {payload['status']['error']}")
-
-
 @app.command()
 def debug(
     config: Annotated[
diff --git a/trinity/cli/perf.py b/trinity/cli/perf.py
new file mode 100644
index 00000000000..bbcf0e93b37
--- /dev/null
+++ b/trinity/cli/perf.py
@@ -0,0 +1,94 @@
+import os
+import traceback
+from typing import Optional
+
+import typer
+from typing_extensions import Annotated
+
+from trinity.common.constants import PLUGIN_DIRS_ENV_VAR
+from trinity.perf import ExplorerPerfOptions, run_explorer_perf, write_explorer_perf_output
+from trinity.perf.report_viewer import launch_report_viewer
+
+
+perf_app = typer.Typer(help="Performance tooling commands.")
+
+
+@perf_app.command("run")
+def perf_run(
+    config: Annotated[
+        str,
+        typer.Option("--config", "-c", help="Path to the config file."),
+    ],
+    module: Annotated[
+        str,
+        typer.Option(
+            "--module", "-m", help="Perf module to run. Currently only supports 'explorer'."
+        ),
+    ] = "explorer",
+    output_path: Annotated[
+        str,
+        typer.Option("--output-path", "-o", help="Path to the output JSON file."),
+    ] = "./perf/output.json",
+    monitor_interval: Annotated[
+        float,
+        typer.Option("--monitor-interval", help="Resource sampling interval in seconds."),
+    ] = 2.0,
+    total_steps: Annotated[
+        int,
+        typer.Option("--total-steps", help="Total steps to run the explorer for."),
+    ] = 5,
+    timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            "--timeout", help="Optional timeout in seconds for prepare, sync and explore calls."
+        ),
+    ] = None,
+    plugin_dir: Annotated[
+        Optional[str],
+        typer.Option("--plugin-dir", help="Path to the directory containing plugin modules."),
+    ] = None,
+) -> None:
+    """Run performance benchmark."""
+    if module != "explorer":
+        raise typer.BadParameter("Only --module explorer is supported for now.")
+
+    try:
+        if plugin_dir:
+            os.environ[PLUGIN_DIRS_ENV_VAR] = plugin_dir
+
+        options = ExplorerPerfOptions(
+            config_path=config,
+            output_path=output_path,
+            monitor_interval=monitor_interval,
+            total_steps=total_steps,
+            timeout=timeout,
+        )
+        payload = run_explorer_perf(options)
+        write_explorer_perf_output(output_path, payload)
+    except Exception:  # noqa: BLE001
+        payload = {
+            "status": {
+                "success": False,
+                "error": traceback.format_exc(),
+            },
+            "data": None,
+        }
+        write_explorer_perf_output(output_path, payload)
+
+    if not payload["status"]["success"]:
+        typer.echo(f"Failed to run perf: {payload['status']['error']}")
+
+
+@perf_app.command("view")
+def perf_view(
+    report: Annotated[
+        str,
+        typer.Option("--report", "-r", help="Path to the perf report JSON file."),
+    ],
+    port: Annotated[
+        int,
+        typer.Option("--port", "-p", help="Port used by the Streamlit report viewer."),
+    ] = 8503,
+) -> None:
+    """Open the Streamlit perf report viewer."""
+    launch_report_viewer(report, port)
\ No newline at end of file
diff --git a/trinity/perf/report_viewer.py b/trinity/perf/report_viewer.py
index 182f1da2440..1aa1c52f694 100644
--- a/trinity/perf/report_viewer.py
+++ b/trinity/perf/report_viewer.py
@@ -51,12 +51,18 @@ def run_viewer(report_path: str, port: int) -> None:
         sys.exit(cli.main())
 
 
+def launch_report_viewer(report_path: str, port: int) -> None:
+    """Launch the Streamlit perf report viewer from another CLI entrypoint."""
+    PerfReportViewer.run_viewer(report_path, port)
+
+
 def has_streamlit_context() -> bool:
     return get_script_run_ctx() is not None
 
 
-if has_streamlit_context():
-    st.set_page_config(page_title="Trinity Performance Report", layout="wide")
+def configure_streamlit_page() -> None:
+    if has_streamlit_context():
+        st.set_page_config(page_title="Trinity Performance Report", layout="wide")
 
 
 def parse_args() -> argparse.Namespace:
@@ -326,6 +332,7 @@ def render_resource_utilization(report: dict[str, Any]) -> None:
 
 
 def main(args: Optional[argparse.Namespace] = None) -> None:
+    configure_streamlit_page()
     if args is None:
         args = parse_args()
 
@@ -347,4 +354,4 @@ def main(args: Optional[argparse.Namespace] = None) -> None:
     if has_streamlit_context():
         main(parsed_args)
     else:
-        PerfReportViewer.run_viewer(parsed_args.report, parsed_args.port)
+        launch_report_viewer(parsed_args.report, parsed_args.port)

From 33a44e067f01b973923cb7a66bfa363874b30529 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 10:16:07 +0800
Subject: [PATCH 10/20] fix pre-commit

---
 trinity/cli/perf.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/trinity/cli/perf.py b/trinity/cli/perf.py
index bbcf0e93b37..72d9a3c9106 100644
--- a/trinity/cli/perf.py
+++ b/trinity/cli/perf.py
@@ -6,10 +6,13 @@
 from typing_extensions import Annotated
 
 from trinity.common.constants import PLUGIN_DIRS_ENV_VAR
-from trinity.perf import ExplorerPerfOptions, run_explorer_perf, write_explorer_perf_output
+from trinity.perf import (
+    ExplorerPerfOptions,
+    run_explorer_perf,
+    write_explorer_perf_output,
+)
 from trinity.perf.report_viewer import launch_report_viewer
 
-
 perf_app = typer.Typer(help="Performance tooling commands.")
 
 
@@ -91,4 +94,4 @@ def perf_view(
     ] = 8503,
 ) -> None:
     """Open the Streamlit perf report viewer."""
-    launch_report_viewer(report, port)
\ No newline at end of file
+    launch_report_viewer(report, port)

From 78af069cacc4b1ed6e8823dfdb7f9f780afd53d8 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 10:28:23 +0800
Subject: [PATCH 11/20] split launcher

---
 trinity/cli/convert.py  |  26 +++++++
 trinity/cli/launcher.py | 163 +++-------------------------------------
 trinity/cli/log.py      |  79 +++++++++++++++++++
 trinity/cli/perf.py     |   2 +-
 trinity/cli/studio.py   |  14 ++++
 trinity/cli/view.py     |  49 ++++++++++++
 6 files changed, 178 insertions(+), 155 deletions(-)
 create mode 100644 trinity/cli/convert.py
 create mode 100644 trinity/cli/log.py
 create mode 100644 trinity/cli/studio.py
 create mode 100644 trinity/cli/view.py

diff --git a/trinity/cli/convert.py b/trinity/cli/convert.py
new file mode 100644
index 00000000000..5fb05007bc5
--- /dev/null
+++ b/trinity/cli/convert.py
@@ -0,0 +1,26 @@
+import os
+from typing import Optional
+
+import typer
+from typing_extensions import Annotated
+
+from trinity.manager.checkpoint_converter import Converter
+
+
+def convert_command(
+    checkpoint_dir: Annotated[
+        str,
+        typer.Option("--checkpoint-dir", "-c", help="The path to the checkpoint directory."),
+    ],
+    base_model_dir: Annotated[
+        Optional[str],
+        typer.Option("--base-model-dir", "-b", help="The path to the base model."),
+    ] = None,
+) -> None:
+    """Convert model checkpoints to huggingface format."""
+    dir_path = checkpoint_dir
+    if "global_step_" in dir_path:
+        while not os.path.basename(dir_path).startswith("global_step_"):
+            dir_path = os.path.dirname(dir_path)
+    converter = Converter(base_model_dir)
+    converter.convert(dir_path)
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 3b8238f1520..92eb193db4b 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -12,10 +12,13 @@
 import typer
 from typing_extensions import Annotated
 
+from trinity.cli.convert import convert_command
+from trinity.cli.log import log_command
 from trinity.cli.perf import perf_app
+from trinity.cli.studio import studio_command
+from trinity.cli.view import view_command
 from trinity.common.config import Config, load_config
 from trinity.common.constants import DEBUG_NAMESPACE, PLUGIN_DIRS_ENV_VAR
-from trinity.manager.checkpoint_converter import Converter
 from trinity.manager.state_manager import StateManager
 from trinity.utils.dlc_utils import is_running, setup_ray_cluster, stop_ray_cluster
 from trinity.utils.log import get_logger
@@ -28,7 +31,6 @@
     pretty_exceptions_show_locals=False,
     pretty_exceptions_short=True,
 )
-app.add_typer(perf_app, name="perf")
 
 
 @dataclass(slots=True)
@@ -407,19 +409,6 @@ def run(
             stop_ray_cluster(namespace=cluster_namespace)
 
 
-@app.command()
-def studio(
-    port: Annotated[
-        int,
-        typer.Option("--port", "-p", help="The port for Trinity-Studio."),
-    ] = 8501,
-) -> None:
-    """Run studio to manage configurations."""
-    from trinity.manager.config_manager import ConfigManager
-
-    ConfigManager.run(port)
-
-
 @app.command()
 def debug(
     config: Annotated[
@@ -499,145 +488,11 @@ def debug(
         )
 
 
-@app.command()
-def view(
-    url: Annotated[
-        str,
-        typer.Option(
-            "--url",
-            help="Database URL for the experience table, for example sqlite:////path/to/debug_buffer.db.",
-        ),
-    ],
-    table: Annotated[
-        str,
-        typer.Option("--table", help="Name of the experience table to monitor."),
-    ],
-    tokenizer: Annotated[
-        str,
-        typer.Option(
-            "--tokenizer",
-            help="Tokenizer/model path used to decode token ids in the viewer.",
-        ),
-    ],
-    schema: Annotated[
-        str,
-        typer.Option(
-            "--schema",
-            help="Schema type of the table. Supported values: experience, sft.",
-        ),
-    ] = "experience",
-    port: Annotated[
-        int,
-        typer.Option("--port", "-p", help="The port for Experience Viewer."),
-    ] = 8502,
-) -> None:
-    """Run the Streamlit viewer to inspect an experience table."""
-    from trinity.buffer.viewer import SQLExperienceViewer
-
-    schema = schema.lower()
-    if schema not in {"experience", "sft"}:
-        raise typer.BadParameter("--schema only supports 'experience' or 'sft'.")
-
-    SQLExperienceViewer.run_viewer(
-        model_path=tokenizer,
-        db_url=url,
-        table_name=table,
-        schema_type=schema,
-        port=port,
-    )
-
-
-@app.command()
-def convert(
-    checkpoint_dir: Annotated[
-        str,
-        typer.Option("--checkpoint-dir", "-c", help="The path to the checkpoint directory."),
-    ],
-    base_model_dir: Annotated[
-        Optional[str],
-        typer.Option("--base-model-dir", "-b", help="The path to the base model."),
-    ] = None,
-) -> None:
-    """Convert checkpoints to huggingface format."""
-    dir_path = checkpoint_dir
-    if "global_step_" in dir_path:
-        while not os.path.basename(dir_path).startswith("global_step_"):
-            dir_path = os.path.dirname(dir_path)
-    converter = Converter(base_model_dir)
-    converter.convert(dir_path)
-
-
-@app.command()
-def log(
-    log_dir: Annotated[
-        str,
-        typer.Option(
-            "--log-dir",
-            "-d",
-            help="Path to the log directory. If provided, it will be used directly and ignore --config.",
-        ),
-    ] = "",
-    config: Annotated[
-        str,
-        typer.Option(
-            "--config",
-            "-c",
-            help="Path to the config file. If provided, it will automatically locate the log directory based on the config.",
-        ),
-    ] = "",
-    keyword: Annotated[
-        Optional[str],
-        typer.Option(
-            "--keyword",
-            "-k",
-            help="Only track log files containing the keyword in their filenames.",
-        ),
-    ] = None,
-    level: Annotated[
-        str,
-        typer.Option("--level", "-l", help="The minimum log level to display in real-time."),
-    ] = "INFO",
-    last_n_lines: Annotated[
-        int,
-        typer.Option("--last-n-lines", "-n", help="Number of last lines to display when starting."),
-    ] = 0,
-    search_pattern: Annotated[
-        Optional[str],
-        typer.Option(
-            "--search-pattern",
-            "-p",
-            help="The pattern to search in log files. Only search for history logs and display all lines containing the pattern.",
-        ),
-    ] = None,
-    no_color: Annotated[
-        bool,
-        typer.Option("--no-color", help="Disable colored output."),
-    ] = False,
-) -> None:
-    """Monitor log files in real-time."""
-    from trinity.manager.log_manager import LogManager
-
-    if not config and not log_dir:
-        raise typer.BadParameter("Either --config or --log-dir must be provided.")
-    if not log_dir:
-        cfg = load_config(config)
-        checkpoint_job_dir = cfg.get_checkpoint_job_dir()
-        # we do not use check_and_update here because user may use this command
-        # in another environment
-        log_dir = os.path.join(checkpoint_job_dir, "log")
-
-    if not os.path.exists(log_dir):
-        raise FileNotFoundError(f"Log directory not found: {log_dir}")
-
-    log_manager = LogManager(
-        log_dir=log_dir,
-        keyword=keyword,
-        min_level=level,
-        color_output=not no_color,
-        last_n_lines=last_n_lines,
-        search_pattern=search_pattern,
-    )
-    log_manager.monitor()
+app.command("studio")(studio_command)
+app.add_typer(perf_app, name="perf")
+app.command("view")(view_command)
+app.command("convert")(convert_command)
+app.command("log")(log_command)
 
 
 def main() -> None:
diff --git a/trinity/cli/log.py b/trinity/cli/log.py
new file mode 100644
index 00000000000..40dc5d9301f
--- /dev/null
+++ b/trinity/cli/log.py
@@ -0,0 +1,79 @@
+import os
+from typing import Optional
+
+import typer
+from typing_extensions import Annotated
+
+from trinity.common.config import load_config
+
+
+def log_command(
+    log_dir: Annotated[
+        str,
+        typer.Option(
+            "--log-dir",
+            "-d",
+            help="Path to the log directory. If provided, it will be used directly and ignore --config.",
+        ),
+    ] = "",
+    config: Annotated[
+        str,
+        typer.Option(
+            "--config",
+            "-c",
+            help="Path to the config file. If provided, it will automatically locate the log directory based on the config.",
+        ),
+    ] = "",
+    keyword: Annotated[
+        Optional[str],
+        typer.Option(
+            "--keyword",
+            "-k",
+            help="Only track log files containing the keyword in their filenames.",
+        ),
+    ] = None,
+    level: Annotated[
+        str,
+        typer.Option("--level", "-l", help="The minimum log level to display in real-time."),
+    ] = "INFO",
+    last_n_lines: Annotated[
+        int,
+        typer.Option("--last-n-lines", "-n", help="Number of last lines to display when starting."),
+    ] = 0,
+    search_pattern: Annotated[
+        Optional[str],
+        typer.Option(
+            "--search-pattern",
+            "-p",
+            help="The pattern to search in log files. Only search for history logs and display all lines containing the pattern.",
+        ),
+    ] = None,
+    no_color: Annotated[
+        bool,
+        typer.Option("--no-color", help="Disable colored output."),
+    ] = False,
+) -> None:
+    """Monitor log files in real-time."""
+    from trinity.manager.log_manager import LogManager
+
+    if not config and not log_dir:
+        raise typer.BadParameter("Either --config or --log-dir must be provided.")
+    if not log_dir:
+        cfg = load_config(config)
+        checkpoint_job_dir = cfg.get_checkpoint_job_dir()
+        # we do not use check_and_update here because user may use this command
+        # in another environment
+        log_dir = os.path.join(checkpoint_job_dir, "log")
+
+    if not os.path.exists(log_dir):
+        raise FileNotFoundError(f"Log directory not found: {log_dir}")
+
+    log_manager = LogManager(
+        log_dir=log_dir,
+        keyword=keyword,
+        min_level=level,
+        color_output=not no_color,
+        last_n_lines=last_n_lines,
+        search_pattern=search_pattern,
+    )
+    log_manager.monitor()
diff --git a/trinity/cli/perf.py b/trinity/cli/perf.py
index 72d9a3c9106..e7e3b85dc69 100644
--- a/trinity/cli/perf.py
+++ b/trinity/cli/perf.py
@@ -13,7 +13,7 @@
 )
 from trinity.perf.report_viewer import launch_report_viewer
 
-perf_app = typer.Typer(help="Performance tooling commands.")
+perf_app = typer.Typer(help="Performance testing tools.")
 
 
 @perf_app.command("run")
diff --git a/trinity/cli/studio.py b/trinity/cli/studio.py
new file mode 100644
index 00000000000..e9ca39a2aef
--- /dev/null
+++ b/trinity/cli/studio.py
@@ -0,0 +1,14 @@
+import typer
+from typing_extensions import Annotated
+
+
+def studio_command(
+    port: Annotated[
+        int,
+        typer.Option("--port", "-p", help="The port for Trinity-Studio."),
+    ] = 8501,
+) -> None:
+    """Run studio to manage configurations."""
+    from trinity.manager.config_manager import ConfigManager
+
+    ConfigManager.run(port)
diff --git a/trinity/cli/view.py b/trinity/cli/view.py
new file mode 100644
index 00000000000..05dc537d317
--- /dev/null
+++ b/trinity/cli/view.py
@@ -0,0 +1,49 @@
+import typer
+from typing_extensions import Annotated
+
+
+def view_command(
+    url: Annotated[
+        str,
+        typer.Option(
+            "--url",
+            help="Database URL for the experience table, for example sqlite:////path/to/debug_buffer.db.",
+        ),
+    ],
+    table: Annotated[
+        str,
+        typer.Option("--table", help="Name of the experience table to monitor."),
+    ],
+    tokenizer: Annotated[
+        str,
+        typer.Option(
+            "--tokenizer",
+            help="Tokenizer/model path used to decode token ids in the viewer.",
+        ),
+    ],
+    schema: Annotated[
+        str,
+        typer.Option(
+            "--schema",
+            help="Schema type of the table. Supported values: experience, sft.",
+        ),
+    ] = "experience",
+    port: Annotated[
+        int,
+        typer.Option("--port", "-p", help="The port for Experience Viewer."),
+    ] = 8502,
+) -> None:
+    """Run the Streamlit viewer to inspect an experience table."""
+    from trinity.buffer.viewer import SQLExperienceViewer
+
+    schema = schema.lower()
+    if schema not in {"experience", "sft"}:
+        raise typer.BadParameter("--schema only supports 'experience' or 'sft'.")
+
+    SQLExperienceViewer.run_viewer(
+        model_path=tokenizer,
+        db_url=url,
+        table_name=table,
+        schema_type=schema,
+        port=port,
+    )

From 03280231993d9c742be5b015280d4578bd35b946 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 14:09:48 +0800
Subject: [PATCH 12/20] support vllm 0.20.0

---
 .../source/tutorial/trinity_installation.md   |  2 +-
 .../tutorial/trinity_installation.md          |  2 +-
 pyproject.toml                                |  7 +--
 scripts/docker/Dockerfile                     | 40 ---------------
 scripts/docker/Dockerfile.megatron            | 50 -------------------
 scripts/docker/Dockerfile.uv                  |  6 +--
 tests/trainer/trainer_test.py                 |  5 +-
 trinity/common/models/vllm_patch/__init__.py  |  2 +-
 .../common/models/vllm_patch/worker_patch.py  |  4 +-
 trinity/explorer/explorer.py                  |  2 +-
 10 files changed, 14 insertions(+), 106 deletions(-)
 delete mode 100644 scripts/docker/Dockerfile
 delete mode 100644 scripts/docker/Dockerfile.megatron

diff --git a/docs/sphinx_doc/source/tutorial/trinity_installation.md b/docs/sphinx_doc/source/tutorial/trinity_installation.md
index a1abd379f9d..1a8f647ff0e 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_installation.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_installation.md
@@ -126,7 +126,7 @@ cd Trinity-RFT
 
 # Build the Docker image
 ## Tip: You can modify the Dockerfile to add mirrors or set API keys
-docker build -f scripts/docker/Dockerfile -t trinity-rft:latest .
+docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest .
 
 # Run the container, replacing <path_to_your_data_and_checkpoints> with your actual path
 docker run -it \
diff --git a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md
index f3eed4cadd8..a50d20b70f3 100644
--- a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md
+++ b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md
@@ -127,7 +127,7 @@ cd Trinity-RFT
 
 # 构建 Docker 镜像
 ## 提示：可根据需要修改 Dockerfile 添加镜像源或设置 API 密钥
-docker build -f scripts/docker/Dockerfile -t trinity-rft:latest .
+docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest .
 
 # 运行容器，请将 <path_to_your_data_and_checkpoints> 替换为实际需要挂载的路径
 docker run -it \
diff --git a/pyproject.toml b/pyproject.toml
index d7b7d71d983..29fcb82e373 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ dependencies = [
     "matplotlib",
     "psutil",
     "nvidia-ml-py",
-    "transformers>=5.5.3",
+    "transformers>=5.6.2",
     "datasets>=4.0.0",
     "typer>=0.20.1",
 ]
@@ -54,9 +54,7 @@ trinity = "trinity.cli.launcher:main"
 
 [project.optional-dependencies]
 vllm = [
-    "vllm>=0.17.0,<=0.19.1",
-    # For v0.17 to v0.19, the default dependencies require transformers < 5.
-    # We have patched vLLM to support transformers >= 5.0.0.
+    "vllm>=0.19.1,<=0.20.0",
 ]
 data = [
     "py-data-juicer>=1.4.3"
@@ -106,7 +104,6 @@ doc = [
 
 mm = [
     "qwen-vl-utils",
-    "transformers>=4.54.0",
     "blobfile",
 ]
 
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
deleted file mode 100644
index ab218222891..00000000000
--- a/scripts/docker/Dockerfile
+++ /dev/null
@@ -1,40 +0,0 @@
-# This Dockerfile sets up a Trinity-RFT environment with minimal support.
-# Build and run the docker image with the following command:
-#
-# cd <Trinity-RFT root dir>
-# docker build -f scripts/docker/Dockerfile -t trinity-rft:latest .
-# docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft:latest
-
-
-FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
-
-WORKDIR /workspace
-
-RUN chmod 1777 /tmp && apt update && apt install -y \
-    build-essential \
-    curl git wget vim tmux net-tools \
-    python3 python3-pip python3-dev python3-packaging \
-    libomp-dev infiniband-diags libibverbs-dev librdmacm-dev rdma-core perftest \
-    && rm -rf /var/lib/apt/lists/* \
-    && ln -sf /usr/bin/python3 /usr/bin/python \
-    && ln -sf /usr/bin/pip3 /usr/bin/pip
-
-
-# For Aliyun users: update pip mirror to aliyun to speed up pip install
-# RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
-#    && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
-
-# copy the Trinity-RFT dir into the workspace
-COPY . .
-
-RUN pip install --upgrade pip && pip install -e .[vllm,mm,dev] && pip install flash_attn==2.8.1 --no-build-isolation
-
-# Set Env variables
-
-# WANDB
-# ENV WANDB_API_KEY=
-# ENV WANDB_BASE_URL=
-
-# LLM API
-# ENV OPENAI_API_KEY=
-# ENV DASH_API_KEY=
diff --git a/scripts/docker/Dockerfile.megatron b/scripts/docker/Dockerfile.megatron
deleted file mode 100644
index 681dd1c9f5e..00000000000
--- a/scripts/docker/Dockerfile.megatron
+++ /dev/null
@@ -1,50 +0,0 @@
-# This Dockerfile sets up a Trinity-RFT environment with Megatron-LM support.
-# Build and run the docker image with the following command:
-#
-# cd <Trinity-RFT root dir>
-# docker build -f scripts/docker/Dockerfile.megatron -t trinity-rft-megatron:latest .
-# docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft-megatron:latest
-
-
-FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
-
-WORKDIR /workspace
-
-RUN chmod 1777 /tmp && apt update && apt install -y \
-    build-essential \
-    curl git wget vim tmux net-tools \
-    python3 python3-pip python3-dev python3-packaging \
-    libomp-dev infiniband-diags libibverbs-dev librdmacm-dev rdma-core perftest \
-    && rm -rf /var/lib/apt/lists/* \
-    && ln -sf /usr/bin/python3 /usr/bin/python \
-    && ln -sf /usr/bin/pip3 /usr/bin/pip
-
-# For Aliyun users: update pip mirror to aliyun to speed up pip install
-# RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
-#    && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
-
-# copy the Trinity-RFT dir into the workspace
-COPY . .
-
-# Install Trinity-RFT with Megatron
-RUN pip install --upgrade pip \
-    && pip install -e .[vllm,mm,dev] \
-    && pip install flash_attn==2.8.1 --no-build-isolation \
-    && pip install -e .[megatron] \
-    && pip install transformer_engine[pytorch]==2.10.0 --no-build-isolation --no-cache-dir \
-    && pip install git+https://github.com/ISEEKYAN/mbridge.git@20e9ffbbe72ae7b1df83bfe1bc3c11f7382f2612 \
-    && NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 pip install -v \
-        --disable-pip-version-check --no-cache-dir --no-build-isolation \
-        --config-settings "--build-option=--cpp_ext" \
-        --config-settings "--build-option=--cuda_ext" \
-        --resume-retries 20 git+https://github.com/NVIDIA/apex.git
-
-# Set Env variables
-
-# WANDB
-# ENV WANDB_API_KEY=
-# ENV WANDB_BASE_URL=
-
-# LLM API
-# ENV OPENAI_API_KEY=
-# ENV DASH_API_KEY=
diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv
index 9af1071e065..5bb63f00794 100644
--- a/scripts/docker/Dockerfile.uv
+++ b/scripts/docker/Dockerfile.uv
@@ -10,7 +10,7 @@
 # 2. The uv virtual environment is created at `/opt/venv`, use `source /opt/venv/bin/activate` to activate it.
 # 3. Make sure to use `uv pip` to install packages within the virtual environment.
 
-FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
+FROM nvcr.io/nvidia/cuda:13.0.1-cudnn-devel-ubuntu22.04
 
 WORKDIR /workspace
 
@@ -37,11 +37,11 @@ RUN pip install uv && uv venv /opt/venv --python=python3.12
 # Install  Trinity-RFT
 RUN . /opt/venv/bin/activate && \
     uv pip install -e.[mm,dev,tinker,data,agent] && \
-    uv pip install vllm==0.19.1 && \
+    uv pip install vllm==0.20.0 && \
     uv pip install flash_attn==2.8.3 --no-build-isolation && \
     uv pip install -e .[megatron,qwen3_5] --no-build-isolation && \
     uv pip install git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303 && \
-    uv pip install transformers==5.5.4 && \
+    uv pip install transformers==5.7.0 && \
     NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 \
     uv pip install -v --no-build-isolation \
     --config-settings="--build-option=--cpp_ext" \
diff --git a/tests/trainer/trainer_test.py b/tests/trainer/trainer_test.py
index 2a71a965b08..739b64c8169 100644
--- a/tests/trainer/trainer_test.py
+++ b/tests/trainer/trainer_test.py
@@ -31,7 +31,8 @@
     get_vision_language_model_path,
 )
 from trinity.buffer import get_buffer_reader
-from trinity.cli.launcher import bench, both, convert, explore, run, serve, train
+from trinity.cli.convert import convert_command
+from trinity.cli.launcher import bench, both, explore, run, serve, train
 from trinity.common.config import (
     AlgorithmConfig,
     BufferConfig,
@@ -149,7 +150,7 @@ def test_trainer(self):
         self.assertGreater(len(hf_dir_step_4), 0)
         self.assertGreater(len(hf_dir_step_8), 0)
         # test checkpoint convert
-        convert(self.config.checkpoint_job_dir)
+        convert_command(self.config.checkpoint_job_dir)
         hf_dir_step_4 = os.listdir(os.path.join(checkpoint_step_4, "actor", "huggingface"))
         hf_dir_step_8 = os.listdir(os.path.join(checkpoint_step_8, "actor", "huggingface"))
         self.assertIn("model.safetensors", hf_dir_step_4)
diff --git a/trinity/common/models/vllm_patch/__init__.py b/trinity/common/models/vllm_patch/__init__.py
index b71996b7395..fd9b895c8d0 100644
--- a/trinity/common/models/vllm_patch/__init__.py
+++ b/trinity/common/models/vllm_patch/__init__.py
@@ -9,7 +9,7 @@
 
 VLLM_VERSION_0120 = parse_version("0.12.0")
 VLLM_VERSION_0170 = parse_version("0.17.0")
-VLLM_VERSION_0191 = parse_version("0.19.1")
+VLLM_VERSION_0191 = parse_version("0.20.0")
 
 
 def vllm_patch():
diff --git a/trinity/common/models/vllm_patch/worker_patch.py b/trinity/common/models/vllm_patch/worker_patch.py
index 6975d5f954f..50dc163feb1 100644
--- a/trinity/common/models/vllm_patch/worker_patch.py
+++ b/trinity/common/models/vllm_patch/worker_patch.py
@@ -13,10 +13,10 @@
 def patch_vllm_prompt_logprobs(model_runner: GPUModelRunner):  # noqa: C901
     """Patch vLLM model runner to support prompt logprobs extraction."""
     version = get_vllm_version()
-    if version < parse_version("0.10.2") or version > parse_version("0.19.1"):
+    if version < parse_version("0.10.2") or version > parse_version("0.20.0"):
         raise ValueError(
             f"Unsupported vllm version: {vllm.__version__}. "
-            "This patch requires vllm version >= 0.10.2, <= 0.19.1."
+            "This patch requires vllm version >= 0.10.2, <= 0.20.0."
         )
     is_v0102 = version == parse_version("0.10.2")
 
diff --git a/trinity/explorer/explorer.py b/trinity/explorer/explorer.py
index fcf5c465903..30466b67387 100644
--- a/trinity/explorer/explorer.py
+++ b/trinity/explorer/explorer.py
@@ -405,7 +405,7 @@ async def _finish_explore_step(self, step: int, model_version: int) -> None:
         if self.taskset is not None:
             self.taskset.feedback(result["metrics"])
         metric.update(result["metrics"])
-        if result["rollout/finished_task_count"] > 0 and self.monitor is not None:
+        if result["finished_task_count"] > 0 and self.monitor is not None:
             self.monitor.log(metric, step=step)
 
     async def _finish_eval_step(self, step: Optional[int] = None, prefix: str = "eval") -> None:

From 591d9438b35f9361c08ace3ecc2571e6cf197487 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 14:13:54 +0800
Subject: [PATCH 13/20] fix doc

---
 docs/sphinx_doc/source/tutorial/example_megatron.md    | 5 +++--
 docs/sphinx_doc/source_zh/tutorial/example_megatron.md | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/sphinx_doc/source/tutorial/example_megatron.md b/docs/sphinx_doc/source/tutorial/example_megatron.md
index b8d0831d2af..e852cfd8f33 100644
--- a/docs/sphinx_doc/source/tutorial/example_megatron.md
+++ b/docs/sphinx_doc/source/tutorial/example_megatron.md
@@ -39,10 +39,10 @@ We provide a Docker setup to simplify environment management.
 #### Build the Docker Image
 
 
-Trinity-RFT provides a dedicated Dockerfile for Megatron-LM located at `scripts/docker/Dockerfile.megatron`. You can build the image using the following command:
+Trinity-RFT's provided Docker already has Megatron-LM related dependencies pre-installed. You can either use our provided Docker image directly or customize the Dockerfile to build your own image as needed.
 
 ```bash
-docker build -f scripts/docker/Dockerfile.megatron -t trinity-rft-megatron:latest .
+docker build -f scripts/docker/Dockerfile.uv -t trinity-rft-megatron:latest .
 ```
 
 > 💡 You can customize the Dockerfile before building — for example, to add pip mirrors or set API keys.
@@ -60,6 +60,7 @@ docker run -it \
 ```
 
 Replace `<your_data_and_checkpoints_path>` with the actual path on your machine where datasets and model checkpoints are stored.
+The image uses `uv` to manage Python dependencies, the virtual environment will be automatically activated after entering the docker container (you can also manually activate it with `source /opt/venv/bin/activate`). The image has include dependencies such as vllm, flash-attn and Megatron-LM, if you need to download more packages, don't forget to activate the virtual environment and use `uv pip install` to install them.
 
 ---
 
diff --git a/docs/sphinx_doc/source_zh/tutorial/example_megatron.md b/docs/sphinx_doc/source_zh/tutorial/example_megatron.md
index f0eaa632791..fc671ba5bfb 100644
--- a/docs/sphinx_doc/source_zh/tutorial/example_megatron.md
+++ b/docs/sphinx_doc/source_zh/tutorial/example_megatron.md
@@ -44,11 +44,10 @@ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
 
 #### 构建 Docker 镜像
 
-Trinity-RFT 提供了专门用于 Megatron-LM 的 Dockerfile，位于 `scripts/docker/Dockerfile.megatron`。
-可以使用以下命令构建镜像：
+Trinity-RFT 提供的 Docker 已经预装了 Megatron-LM 相关依赖。你可以直接使用我们提供的 Docker 镜像，或者根据需要自定义 Dockerfile 来构建镜像。
 
 ```bash
-docker build -f scripts/docker/Dockerfile.megatron -t trinity-rft-megatron:latest .
+docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest .
 ```
 
 > 💡 你可以在构建前自定义 Dockerfile —— 例如添加 pip 镜像源或设置 API 密钥。
@@ -62,10 +61,11 @@ docker run -it \
     --rm \
     -v $PWD:/workspace \
     -v <your_data_and_checkpoints_path>:/data \
-    trinity-rft-megatron:latest
+    trinity-rft:latest
 ```
 
 请将 `<your_data_and_checkpoints_path>` 替换为你机器上存储数据集和模型检查点的实际路径。
+该镜像使用 `uv` 来管理 Python 依赖，进入容器后虚拟环境会自动激活（也可通过 `source /opt/venv/bin/activate` 手动激活）。该镜像已经包含了 vllm, flash-attn 以及 Megatron-LM，如果需要使用其他依赖，可直接使用 `uv pip install` 来安装它们。
 
 ---
 

From 1836a3d0f5cad9900697059249cb21eee89f97b2 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 29 Apr 2026 21:30:53 +0800
Subject: [PATCH 14/20] fix docker file

---
 .github/workflows/docker/docker-compose.yaml |  4 +--
 pyproject.toml                               |  2 +-
 scripts/docker/Dockerfile.uv                 | 30 ++++++++++++++++----
 trinity/cli/convert.py                       |  4 +--
 trinity/cli/launcher.py                      |  2 +-
 trinity/cli/perf.py                          | 14 +++++----
 6 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml
index 0ea1026f3f0..b9be76568f1 100644
--- a/.github/workflows/docker/docker-compose.yaml
+++ b/.github/workflows/docker/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   trinity-node-1:
-    image: trinity-rft-unittest:20260420
+    image: trinity-rft-unittest:20260429
     cap_add:
       - SYS_PTRACE
     pull_policy: never
@@ -34,7 +34,7 @@ services:
             capabilities: [gpu]
 
   trinity-node-2:
-    image: trinity-rft-unittest:20260420
+    image: trinity-rft-unittest:20260429
     cap_add:
       - SYS_PTRACE
     pull_policy: never
diff --git a/pyproject.toml b/pyproject.toml
index 29fcb82e373..5a519664ffb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,7 +81,7 @@ megatron = [
     "megatron-core[mlm]==0.16.1",
     # if you found "undefined symbol" error in transformer engine
     # reinstall it with --no-build-isolation and `--no-cache-dir` flag
-    "transformer_engine[pytorch]==2.13.0",
+    "transformer-engine[pytorch]==2.13.0",
 
     # Install mbridge from main branch (unreleased version)
     # "mbridge @ git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303",
diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv
index 5bb63f00794..4dc0f857ae3 100644
--- a/scripts/docker/Dockerfile.uv
+++ b/scripts/docker/Dockerfile.uv
@@ -19,11 +19,15 @@ RUN chmod 1777 /tmp && apt update && apt install -y \
     curl git wget vim tmux net-tools \
     python3 python3-pip python3-dev python3-packaging python3-venv \
     libomp-dev libnuma1 infiniband-diags libibverbs-dev librdmacm-dev rdma-core perftest \
+    libnuma-dev \
     && rm -rf /var/lib/apt/lists/* \
     && ln -sf /usr/bin/python3 /usr/bin/python \
     && ln -sf /usr/bin/pip3 /usr/bin/pip
 
 ENV VIRTUAL_ENV=/opt/venv
+ARG BUILD_JOBS=16
+ARG NVTE_BUILD_THREADS_PER_JOB=2
+ARG NVCC_THREADS=8
 
 # copy the Trinity-RFT dir into the workspace
 COPY . .
@@ -37,18 +41,32 @@ RUN pip install uv && uv venv /opt/venv --python=python3.12
 # Install  Trinity-RFT
 RUN . /opt/venv/bin/activate && \
     uv pip install -e.[mm,dev,tinker,data,agent] && \
-    uv pip install vllm==0.20.0 && \
-    uv pip install flash_attn==2.8.3 --no-build-isolation && \
-    uv pip install -e .[megatron,qwen3_5] --no-build-isolation && \
-    uv pip install git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303 && \
-    uv pip install transformers==5.7.0 && \
-    NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 \
+    uv pip install vllm==0.20.0
+
+# Install flash attention
+RUN . /opt/venv/bin/activate && \
+    MAX_JOBS=${BUILD_JOBS} \
+    uv pip install flash_attn==2.8.3 --no-build-isolation
+
+# Install Megatron
+RUN . /opt/venv/bin/activate && MAX_JOBS=${BUILD_JOBS} \
+    CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} \
+    NVTE_BUILD_THREADS_PER_JOB=${NVTE_BUILD_THREADS_PER_JOB} \
+    uv pip install -e .[qwen3_5] --no-build-isolation && \
+    uv pip install megatron-core[mlm]==0.16.1 --no-build-isolation && \
+    uv pip install transformer-engine[pytorch]==2.14.0 && \
+    uv pip install transformer-engine[pytorch]==2.14.0 --no-build-isolation --no-cache-dir
+    # the above line is necessary, otherwise you may encounter "undefined symbol" error when importing transformer engine, so do not remove it even if it looks redundant
+
+RUN . /opt/venv/bin/activate && uv pip install git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303 && \
+    NVCC_APPEND_FLAGS="--threads ${NVCC_THREADS}" APEX_PARALLEL_BUILD=${BUILD_JOBS} \
     uv pip install -v --no-build-isolation \
     --config-settings="--build-option=--cpp_ext" \
     --config-settings="--build-option=--cuda_ext" \
     git+https://github.com/NVIDIA/apex.git
 
 # Set Env variables
+# ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64
 
 # WANDB
 # ENV WANDB_API_KEY=
diff --git a/trinity/cli/convert.py b/trinity/cli/convert.py
index 5fb05007bc5..085df5f17ae 100644
--- a/trinity/cli/convert.py
+++ b/trinity/cli/convert.py
@@ -4,8 +4,6 @@
 import typer
 from typing_extensions import Annotated
 
-from trinity.manager.checkpoint_converter import Converter
-
 
 def convert_command(
     checkpoint_dir: Annotated[
@@ -18,6 +16,8 @@ def convert_command(
     ] = None,
 ) -> None:
     """Convert model checkpoints to huggingface format."""
+    from trinity.manager.checkpoint_converter import Converter
+
     dir_path = checkpoint_dir
     if "global_step_" in dir_path:
         while not os.path.basename(dir_path).startswith("global_step_"):
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
index 92eb193db4b..3bf6ca7e0e9 100644
--- a/trinity/cli/launcher.py
+++ b/trinity/cli/launcher.py
@@ -19,7 +19,6 @@
 from trinity.cli.view import view_command
 from trinity.common.config import Config, load_config
 from trinity.common.constants import DEBUG_NAMESPACE, PLUGIN_DIRS_ENV_VAR
-from trinity.manager.state_manager import StateManager
 from trinity.utils.dlc_utils import is_running, setup_ray_cluster, stop_ray_cluster
 from trinity.utils.log import get_logger
 from trinity.utils.plugin_loader import load_plugins
@@ -368,6 +367,7 @@ def run(
 
     try:
         if cfg.stages:
+            from trinity.manager.state_manager import StateManager
             from trinity.trainer.verl.utils import get_latest_hf_checkpoint_path
 
             state_manager = StateManager(
diff --git a/trinity/cli/perf.py b/trinity/cli/perf.py
index e7e3b85dc69..4c3383ace16 100644
--- a/trinity/cli/perf.py
+++ b/trinity/cli/perf.py
@@ -6,12 +6,6 @@
 from typing_extensions import Annotated
 
 from trinity.common.constants import PLUGIN_DIRS_ENV_VAR
-from trinity.perf import (
-    ExplorerPerfOptions,
-    run_explorer_perf,
-    write_explorer_perf_output,
-)
-from trinity.perf.report_viewer import launch_report_viewer
 
 perf_app = typer.Typer(help="Performance testing tools.")
 
@@ -55,6 +49,12 @@ def perf_run(
     if module != "explorer":
         raise typer.BadParameter("Only --module explorer is supported for now.")
 
+    from trinity.perf import (
+        ExplorerPerfOptions,
+        run_explorer_perf,
+        write_explorer_perf_output,
+    )
+
     try:
         if plugin_dir:
             os.environ[PLUGIN_DIRS_ENV_VAR] = plugin_dir
@@ -94,4 +94,6 @@ def perf_view(
     ] = 8503,
 ) -> None:
     """Open the Streamlit perf report viewer."""
+    from trinity.perf.report_viewer import launch_report_viewer
+
     launch_report_viewer(report, port)

From 9c90f8dc62253190d8b64d37e196e9aa2683f083 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Thu, 30 Apr 2026 16:45:33 +0800
Subject: [PATCH 15/20] fix dockerfile

---
 .github/workflows/docker/docker-compose.yaml |  4 ++--
 pyproject.toml                               |  5 ++---
 scripts/docker/Dockerfile.uv                 | 20 +++++++++++++-------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml
index b9be76568f1..3f3fcf4f21b 100644
--- a/.github/workflows/docker/docker-compose.yaml
+++ b/.github/workflows/docker/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   trinity-node-1:
-    image: trinity-rft-unittest:20260429
+    image: trinity-rft-unittest:20260501
     cap_add:
       - SYS_PTRACE
     pull_policy: never
@@ -34,7 +34,7 @@ services:
             capabilities: [gpu]
 
   trinity-node-2:
-    image: trinity-rft-unittest:20260429
+    image: trinity-rft-unittest:20260501
     cap_add:
       - SYS_PTRACE
     pull_policy: never
diff --git a/pyproject.toml b/pyproject.toml
index 5a519664ffb..733f44dc570 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,9 +79,8 @@ dev = [
 ]
 megatron = [
     "megatron-core[mlm]==0.16.1",
-    # if you found "undefined symbol" error in transformer engine
-    # reinstall it with --no-build-isolation and `--no-cache-dir` flag
-    "transformer-engine[pytorch]==2.13.0",
+    # Please install transformer-engine from source, do not install from pip
+    # "transformer-engine[pytorch]==2.14.1",
 
     # Install mbridge from main branch (unreleased version)
     # "mbridge @ git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303",
diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv
index 4dc0f857ae3..4da21404e32 100644
--- a/scripts/docker/Dockerfile.uv
+++ b/scripts/docker/Dockerfile.uv
@@ -16,7 +16,7 @@ WORKDIR /workspace
 
 RUN chmod 1777 /tmp && apt update && apt install -y \
     build-essential \
-    curl git wget vim tmux net-tools \
+    curl git wget vim tmux net-tools cmake \
     python3 python3-pip python3-dev python3-packaging python3-venv \
     libomp-dev libnuma1 infiniband-diags libibverbs-dev librdmacm-dev rdma-core perftest \
     libnuma-dev \
@@ -25,7 +25,7 @@ RUN chmod 1777 /tmp && apt update && apt install -y \
     && ln -sf /usr/bin/pip3 /usr/bin/pip
 
 ENV VIRTUAL_ENV=/opt/venv
-ARG BUILD_JOBS=16
+ARG BUILD_JOBS=32
 ARG NVTE_BUILD_THREADS_PER_JOB=2
 ARG NVCC_THREADS=8
 
@@ -48,17 +48,23 @@ RUN . /opt/venv/bin/activate && \
     MAX_JOBS=${BUILD_JOBS} \
     uv pip install flash_attn==2.8.3 --no-build-isolation
 
+# Install Transformer Engine
+RUN . /opt/venv/bin/activate && \
+    git clone --branch stable --depth 1 --recursive https://github.com/NVIDIA/TransformerEngine.git /tmp/TransformerEngine && \
+    MAX_JOBS=${BUILD_JOBS} \
+    CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} \
+    NVTE_BUILD_THREADS_PER_JOB=${NVTE_BUILD_THREADS_PER_JOB} \
+    NVTE_FRAMEWORK=pytorch \
+    uv pip install --no-build-isolation /tmp/TransformerEngine && \
+    rm -rf /tmp/TransformerEngine
+
 # Install Megatron
 RUN . /opt/venv/bin/activate && MAX_JOBS=${BUILD_JOBS} \
     CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} \
     NVTE_BUILD_THREADS_PER_JOB=${NVTE_BUILD_THREADS_PER_JOB} \
     uv pip install -e .[qwen3_5] --no-build-isolation && \
     uv pip install megatron-core[mlm]==0.16.1 --no-build-isolation && \
-    uv pip install transformer-engine[pytorch]==2.14.0 && \
-    uv pip install transformer-engine[pytorch]==2.14.0 --no-build-isolation --no-cache-dir
-    # the above line is necessary, otherwise you may encounter "undefined symbol" error when importing transformer engine, so do not remove it even if it looks redundant
-
-RUN . /opt/venv/bin/activate && uv pip install git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303 && \
+    uv pip install git+https://github.com/ISEEKYAN/mbridge.git@90c4633a6cdcfe5d29723d7b145d32f6f5e73303 && \
     NVCC_APPEND_FLAGS="--threads ${NVCC_THREADS}" APEX_PARALLEL_BUILD=${BUILD_JOBS} \
     uv pip install -v --no-build-isolation \
     --config-settings="--build-option=--cpp_ext" \

From 1d9506bf20ef230fd47795d0ec23efe2fd3974cd Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Thu, 30 Apr 2026 16:53:46 +0800
Subject: [PATCH 16/20] add perf tests

---
 tests/perf/resource_backends_test.py | 70 ++++++++++++++++++++++++++
 tests/perf/resource_sampler_test.py  | 75 ++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 tests/perf/resource_backends_test.py
 create mode 100644 tests/perf/resource_sampler_test.py

diff --git a/tests/perf/resource_backends_test.py b/tests/perf/resource_backends_test.py
new file mode 100644
index 00000000000..5018713bb66
--- /dev/null
+++ b/tests/perf/resource_backends_test.py
@@ -0,0 +1,70 @@
+"""Tests for NVML-backed perf resource sampling."""
+
+import unittest
+from types import SimpleNamespace
+from unittest.mock import patch
+
+from trinity.perf.resource_backends import SystemResourceBackend
+
+
+class FakeProcess:
+    def __init__(self):
+        self._cpu_values = iter([0.0, 12.5])
+
+    def cpu_percent(self, _interval=None):
+        return next(self._cpu_values)
+
+    def memory_info(self):
+        return SimpleNamespace(rss=256 * 1024 * 1024)
+
+    def memory_percent(self):
+        return 1.25
+
+
+class SystemResourceBackendTest(unittest.TestCase):
+    @patch("trinity.perf.resource_backends.time.sleep")
+    @patch("trinity.perf.resource_backends.nvmlDeviceGetName", return_value="GPU-0")
+    @patch("trinity.perf.resource_backends.nvmlDeviceGetHandleByIndex", return_value=object())
+    @patch("trinity.perf.resource_backends.nvmlDeviceGetCount", return_value=1)
+    @patch("trinity.perf.resource_backends.nvmlShutdown")
+    @patch("trinity.perf.resource_backends.nvmlInit")
+    @patch("trinity.perf.resource_backends.psutil.Process", return_value=FakeProcess())
+    def test_sample_keeps_peak_gpu_utilization_within_one_outer_sample(
+        self,
+        _mock_process,
+        _mock_nvml_init,
+        _mock_nvml_shutdown,
+        _mock_gpu_count,
+        _mock_gpu_handle,
+        _mock_gpu_name,
+        _mock_sleep,
+    ):
+        utilization_side_effect = [
+            SimpleNamespace(gpu=0.0),
+            SimpleNamespace(gpu=35.0),
+            SimpleNamespace(gpu=80.0),
+        ]
+        memory_side_effect = [
+            SimpleNamespace(used=100 * 1024 * 1024, total=500 * 1024 * 1024),
+            SimpleNamespace(used=120 * 1024 * 1024, total=500 * 1024 * 1024),
+            SimpleNamespace(used=110 * 1024 * 1024, total=500 * 1024 * 1024),
+        ]
+
+        with patch(
+            "trinity.perf.resource_backends.nvmlDeviceGetUtilizationRates",
+            side_effect=utilization_side_effect,
+        ), patch(
+            "trinity.perf.resource_backends.nvmlDeviceGetMemoryInfo",
+            side_effect=memory_side_effect,
+        ):
+            backend = SystemResourceBackend(
+                gpu_subsample_count=3,
+                gpu_subsample_interval_seconds=0.0,
+            )
+            backend.open()
+            sample = backend.sample()
+            backend.close()
+
+        self.assertEqual(sample.cpu_percent, 12.5)
+        self.assertEqual(sample.gpu_metrics[0].gpu_util_percent, 80.0)
+        self.assertEqual(sample.gpu_metrics[0].gpu_memory_used_mb, 120.0)
diff --git a/tests/perf/resource_sampler_test.py b/tests/perf/resource_sampler_test.py
new file mode 100644
index 00000000000..72dde364dc3
--- /dev/null
+++ b/tests/perf/resource_sampler_test.py
@@ -0,0 +1,75 @@
+"""Tests for perf resource timeline helpers."""
+
+import itertools
+import time
+import unittest
+
+from trinity.perf.report_utils import build_resource_timeline_payload
+from trinity.perf.resource_backends import GPUSample, ResourceSample
+from trinity.perf.resource_sampler import ResourceSampler
+
+
+class FakeBackend:
+    def __init__(self):
+        self.opened = False
+        self.closed = False
+        self.sample_index = itertools.count()
+
+    def open(self):
+        self.opened = True
+
+    def close(self):
+        self.closed = True
+
+    def sample(self):
+        index = next(self.sample_index)
+        return ResourceSample(
+            timestamp=1000.0 + index,
+            cpu_percent=50.0 + index,
+            memory_rss_mb=1024.0 + index,
+            memory_percent=20.0 + index,
+            gpu_metrics=[
+                GPUSample(
+                    gpu_id=0,
+                    name="GPU-0",
+                    gpu_util_percent=70.0 + index,
+                    gpu_memory_used_mb=16000.0 + index,
+                    gpu_memory_total_mb=24000.0,
+                ),
+                GPUSample(
+                    gpu_id=1,
+                    name="GPU-1",
+                    gpu_util_percent=75.0 + index,
+                    gpu_memory_used_mb=15000.0 + index,
+                    gpu_memory_total_mb=24000.0,
+                ),
+            ],
+        )
+
+
+class ResourceSamplerTest(unittest.TestCase):
+    def test_resource_sampler_collects_samples(self):
+        backend = FakeBackend()
+        sampler = ResourceSampler(interval_seconds=0.01, backend=backend)
+
+        sampler.start()
+        time.sleep(0.03)
+        samples = sampler.stop()
+
+        self.assertTrue(backend.opened)
+        self.assertTrue(backend.closed)
+        self.assertGreaterEqual(len(samples), 2)
+        self.assertEqual(samples[0].gpu_metrics[0].gpu_id, 0)
+
+    def test_build_resource_timeline_payload_keeps_cpu_single_line_and_gpu_per_device(self):
+        samples = [FakeBackend().sample(), FakeBackend().sample()]
+
+        payload = build_resource_timeline_payload(samples)
+
+        self.assertEqual(len(payload["resource_timeline"]), 2)
+        self.assertEqual(len(payload["chart_series"]["cpu_percent"]), 2)
+        self.assertEqual(set(payload["chart_series"]["gpu_util_percent"].keys()), {"0", "1"})
+        self.assertEqual(
+            payload["chart_series"]["gpu_memory_used_mb"]["0"]["name"],
+            "GPU-0",
+        )

From f7245d652844755784a4d4c66b7bd23a34adfb9d Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Thu, 30 Apr 2026 19:15:35 +0800
Subject: [PATCH 17/20] fix tests

---
 pyproject.toml                       | 2 +-
 tests/cli/launcher_test.py           | 2 +-
 tests/perf/resource_backends_test.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 733f44dc570..fa7abc1db6c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,7 +88,7 @@ megatron = [
     # "megatron-bridge==0.3.1",
 ]
 tinker = [
-    "tinker>=0.10.0; python_version >= '3.11'",
+    "tinker>=0.10.0,<=0.16.1; python_version >= '3.11'",
 ]
 
 doc = [
diff --git a/tests/cli/launcher_test.py b/tests/cli/launcher_test.py
index 5e5aa9e0089..337e17fe4dd 100644
--- a/tests/cli/launcher_test.py
+++ b/tests/cli/launcher_test.py
@@ -401,7 +401,7 @@ def test_debug_mode(self, mock_load):
             process.terminate()
 
     @mock.patch("trinity.manager.log_manager.LogManager")
-    @mock.patch("trinity.cli.launcher.load_config")
+    @mock.patch("trinity.cli.log.load_config")
     def test_log_mode(self, mock_load_config, mock_log_manager):
         result = runner.invoke(launcher.app, ["log"])
         self.assertNotEqual(result.exit_code, 0)
diff --git a/tests/perf/resource_backends_test.py b/tests/perf/resource_backends_test.py
index 5018713bb66..0b2f6e414be 100644
--- a/tests/perf/resource_backends_test.py
+++ b/tests/perf/resource_backends_test.py
@@ -11,7 +11,7 @@ class FakeProcess:
     def __init__(self):
         self._cpu_values = iter([0.0, 12.5])
 
-    def cpu_percent(self, _interval=None):
+    def cpu_percent(self, interval=None):
         return next(self._cpu_values)
 
     def memory_info(self):

From 66dfad7f0aad1fef20d4ced1168c779d30f63803 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Sun, 3 May 2026 17:44:54 +0800
Subject: [PATCH 18/20] update vllm to 0.20.1

---
 pyproject.toml                                   | 2 +-
 scripts/docker/Dockerfile.uv                     | 2 +-
 trinity/common/models/vllm_patch/__init__.py     | 6 +++---
 trinity/common/models/vllm_patch/worker_patch.py | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index fa7abc1db6c..bbb92fb6b83 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,7 @@ trinity = "trinity.cli.launcher:main"
 
 [project.optional-dependencies]
 vllm = [
-    "vllm>=0.19.1,<=0.20.0",
+    "vllm>=0.19.1,<=0.20.1",
 ]
 data = [
     "py-data-juicer>=1.4.3"
diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv
index 4da21404e32..58e62d7d100 100644
--- a/scripts/docker/Dockerfile.uv
+++ b/scripts/docker/Dockerfile.uv
@@ -41,7 +41,7 @@ RUN pip install uv && uv venv /opt/venv --python=python3.12
 # Install  Trinity-RFT
 RUN . /opt/venv/bin/activate && \
     uv pip install -e.[mm,dev,tinker,data,agent] && \
-    uv pip install vllm==0.20.0
+    uv pip install vllm==0.20.1
 
 # Install flash attention
 RUN . /opt/venv/bin/activate && \
diff --git a/trinity/common/models/vllm_patch/__init__.py b/trinity/common/models/vllm_patch/__init__.py
index fd9b895c8d0..f458ec65000 100644
--- a/trinity/common/models/vllm_patch/__init__.py
+++ b/trinity/common/models/vllm_patch/__init__.py
@@ -9,7 +9,7 @@
 
 VLLM_VERSION_0120 = parse_version("0.12.0")
 VLLM_VERSION_0170 = parse_version("0.17.0")
-VLLM_VERSION_0191 = parse_version("0.20.0")
+VLLM_VERSION_0201 = parse_version("0.20.1")
 
 
 def vllm_patch():
@@ -64,7 +64,7 @@ def _get_api_server_runner(vllm_version):
 
         return run_api_server_in_ray_actor_v13
 
-    if VLLM_VERSION_0170 <= vllm_version <= VLLM_VERSION_0191:
+    if VLLM_VERSION_0170 <= vllm_version <= VLLM_VERSION_0201:
         from trinity.common.models.vllm_patch.api_patch_v17 import (
             run_api_server_in_ray_actor_v17,
         )
@@ -73,7 +73,7 @@ def _get_api_server_runner(vllm_version):
 
     raise ValueError(
         f"Unsupported vLLM version: {vllm.__version__}. "
-        "This patch supports vLLM versions 0.12.0, (0.12.0, 0.17.0), and [0.17.0, 0.19.1]."
+        "This patch supports vLLM versions 0.12.0, (0.12.0, 0.17.0), and [0.17.0, 0.20.1]."
     )
 
 
diff --git a/trinity/common/models/vllm_patch/worker_patch.py b/trinity/common/models/vllm_patch/worker_patch.py
index 50dc163feb1..1f57cc5a7e8 100644
--- a/trinity/common/models/vllm_patch/worker_patch.py
+++ b/trinity/common/models/vllm_patch/worker_patch.py
@@ -13,10 +13,10 @@
 def patch_vllm_prompt_logprobs(model_runner: GPUModelRunner):  # noqa: C901
     """Patch vLLM model runner to support prompt logprobs extraction."""
     version = get_vllm_version()
-    if version < parse_version("0.10.2") or version > parse_version("0.20.0"):
+    if version < parse_version("0.10.2") or version > parse_version("0.20.1"):
         raise ValueError(
             f"Unsupported vllm version: {vllm.__version__}. "
-            "This patch requires vllm version >= 0.10.2, <= 0.20.0."
+            "This patch requires vllm version >= 0.10.2, <= 0.20.1."
         )
     is_v0102 = version == parse_version("0.10.2")
 

From 4818f8a928fdd45352f88c93f8eecc56a3f24220 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 6 May 2026 10:00:23 +0800
Subject: [PATCH 19/20] update docker compose

---
 .github/workflows/docker/docker-compose.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml
index 3f3fcf4f21b..544916bdd93 100644
--- a/.github/workflows/docker/docker-compose.yaml
+++ b/.github/workflows/docker/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   trinity-node-1:
-    image: trinity-rft-unittest:20260501
+    image: trinity-rft-unittest:20260506
     cap_add:
       - SYS_PTRACE
     pull_policy: never
@@ -34,7 +34,7 @@ services:
             capabilities: [gpu]
 
   trinity-node-2:
-    image: trinity-rft-unittest:20260501
+    image: trinity-rft-unittest:20260506
     cap_add:
       - SYS_PTRACE
     pull_policy: never

From bdd01146aba44d7fb843c862142c372e8a7cfa81 Mon Sep 17 00:00:00 2001
From: pxc <xc_pan@foxmail.com>
Date: Wed, 6 May 2026 10:20:44 +0800
Subject: [PATCH 20/20] simplify perf

---
 tests/perf/resource_sampler_test.py | 23 +++++++----
 trinity/perf/__init__.py            |  2 -
 trinity/perf/report_utils.py        | 59 -----------------------------
 trinity/perf/report_viewer.py       | 42 +++++++++++++++++---
 trinity/perf/stage_perf.py          | 23 ++++++-----
 5 files changed, 63 insertions(+), 86 deletions(-)
 delete mode 100644 trinity/perf/report_utils.py

diff --git a/tests/perf/resource_sampler_test.py b/tests/perf/resource_sampler_test.py
index 72dde364dc3..dbf624e88a0 100644
--- a/tests/perf/resource_sampler_test.py
+++ b/tests/perf/resource_sampler_test.py
@@ -3,9 +3,13 @@
 import itertools
 import time
 import unittest
+from typing import cast
 
-from trinity.perf.report_utils import build_resource_timeline_payload
-from trinity.perf.resource_backends import GPUSample, ResourceSample
+from trinity.perf.resource_backends import (
+    GPUSample,
+    ResourceSample,
+    SystemResourceBackend,
+)
 from trinity.perf.resource_sampler import ResourceSampler
 
 
@@ -50,7 +54,10 @@ def sample(self):
 class ResourceSamplerTest(unittest.TestCase):
     def test_resource_sampler_collects_samples(self):
         backend = FakeBackend()
-        sampler = ResourceSampler(interval_seconds=0.01, backend=backend)
+        sampler = ResourceSampler(
+            interval_seconds=0.01,
+            backend=cast(SystemResourceBackend, backend),
+        )
 
         sampler.start()
         time.sleep(0.03)
@@ -61,15 +68,15 @@ def test_resource_sampler_collects_samples(self):
         self.assertGreaterEqual(len(samples), 2)
         self.assertEqual(samples[0].gpu_metrics[0].gpu_id, 0)
 
-    def test_build_resource_timeline_payload_keeps_cpu_single_line_and_gpu_per_device(self):
+    def test_resource_samples_serialize_cpu_single_line_and_gpu_per_device(self):
         samples = [FakeBackend().sample(), FakeBackend().sample()]
 
-        payload = build_resource_timeline_payload(samples)
+        payload = {"resource_timeline": [sample.to_dict() for sample in samples]}
 
         self.assertEqual(len(payload["resource_timeline"]), 2)
-        self.assertEqual(len(payload["chart_series"]["cpu_percent"]), 2)
-        self.assertEqual(set(payload["chart_series"]["gpu_util_percent"].keys()), {"0", "1"})
+        self.assertEqual(payload["resource_timeline"][0]["cpu_percent"], 50.0)
+        self.assertEqual(len(payload["resource_timeline"][0]["gpu_metrics"]), 2)
         self.assertEqual(
-            payload["chart_series"]["gpu_memory_used_mb"]["0"]["name"],
+            payload["resource_timeline"][0]["gpu_metrics"][0]["name"],
             "GPU-0",
         )
diff --git a/trinity/perf/__init__.py b/trinity/perf/__init__.py
index 1af3fa790c7..f2d9587344a 100644
--- a/trinity/perf/__init__.py
+++ b/trinity/perf/__init__.py
@@ -1,6 +1,5 @@
 """Performance tooling package for Trinity."""
 
-from .report_utils import build_resource_timeline_payload
 from .resource_sampler import ResourceSampler
 from .stage_perf import (
     ExplorerPerfOptions,
@@ -13,7 +12,6 @@
     "ExplorerPerfOptions",
     "ResourceSampler",
     "TensorBoardScalarReader",
-    "build_resource_timeline_payload",
     "collect_step_metrics",
     "run_explorer_perf",
     "write_explorer_perf_output",
diff --git a/trinity/perf/report_utils.py b/trinity/perf/report_utils.py
deleted file mode 100644
index 7cc5bd2a066..00000000000
--- a/trinity/perf/report_utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""Reporting helpers for performance tooling."""
-
-from __future__ import annotations
-
-from collections import defaultdict
-
-from trinity.perf.resource_backends import ResourceSample
-
-
-def build_resource_timeline_payload(samples: list[ResourceSample]) -> dict:
-    """Convert raw resource samples into a chart-friendly timeline payload."""
-    timeline = [sample.to_dict() for sample in samples]
-    cpu_series = [
-        {"timestamp": sample.timestamp, "value": sample.cpu_percent} for sample in samples
-    ]
-    memory_rss_series = [
-        {"timestamp": sample.timestamp, "value": sample.memory_rss_mb} for sample in samples
-    ]
-    memory_percent_series = [
-        {"timestamp": sample.timestamp, "value": sample.memory_percent} for sample in samples
-    ]
-
-    gpu_util_series: dict[int, list[dict]] = defaultdict(list)
-    gpu_memory_series: dict[int, list[dict]] = defaultdict(list)
-    gpu_names: dict[int, str] = {}
-    for sample in samples:
-        for gpu_sample in sample.gpu_metrics:
-            gpu_names[gpu_sample.gpu_id] = gpu_sample.name
-            gpu_util_series[gpu_sample.gpu_id].append(
-                {"timestamp": sample.timestamp, "value": gpu_sample.gpu_util_percent}
-            )
-            gpu_memory_series[gpu_sample.gpu_id].append(
-                {"timestamp": sample.timestamp, "value": gpu_sample.gpu_memory_used_mb}
-            )
-
-    return {
-        "resource_timeline": timeline,
-        "chart_series": {
-            "cpu_percent": cpu_series,
-            "memory_rss_mb": memory_rss_series,
-            "memory_percent": memory_percent_series,
-            "gpu_util_percent": {
-                str(gpu_id): {
-                    "gpu_id": gpu_id,
-                    "name": gpu_names[gpu_id],
-                    "values": values,
-                }
-                for gpu_id, values in gpu_util_series.items()
-            },
-            "gpu_memory_used_mb": {
-                str(gpu_id): {
-                    "gpu_id": gpu_id,
-                    "name": gpu_names[gpu_id],
-                    "values": values,
-                }
-                for gpu_id, values in gpu_memory_series.items()
-            },
-        },
-    }
diff --git a/trinity/perf/report_viewer.py b/trinity/perf/report_viewer.py
index 1aa1c52f694..415bfe74ca5 100644
--- a/trinity/perf/report_viewer.py
+++ b/trinity/perf/report_viewer.py
@@ -151,6 +151,38 @@ def build_elapsed_series(series: list[dict[str, Any]]) -> tuple[list[float], lis
     return x_values, y_values
 
 
+def build_scalar_timeline_series(
+    timeline: list[dict[str, Any]], metric_key: str
+) -> list[dict[str, float]]:
+    return [
+        {"timestamp": sample["timestamp"], "value": sample[metric_key]}
+        for sample in timeline
+        if sample.get(metric_key) is not None
+    ]
+
+
+def build_gpu_timeline_series(
+    timeline: list[dict[str, Any]], metric_key: str
+) -> dict[str, dict[str, Any]]:
+    series_by_gpu: dict[str, dict[str, Any]] = {}
+    for sample in timeline:
+        timestamp = sample.get("timestamp")
+        for gpu_sample in sample.get("gpu_metrics", []):
+            if gpu_sample.get(metric_key) is None:
+                continue
+            gpu_key = str(gpu_sample.get("gpu_id"))
+            gpu_payload = series_by_gpu.setdefault(
+                gpu_key,
+                {
+                    "gpu_id": gpu_sample.get("gpu_id"),
+                    "name": gpu_sample.get("name"),
+                    "values": [],
+                },
+            )
+            gpu_payload["values"].append({"timestamp": timestamp, "value": gpu_sample[metric_key]})
+    return series_by_gpu
+
+
 def render_line_chart(
     title: str,
     x_values: list[float],
@@ -284,21 +316,21 @@ def render_step_metrics(report: dict[str, Any]) -> None:
 
 def render_resource_utilization(report: dict[str, Any]) -> None:
     st.header("Resource Utilization")
-    chart_series = report.get("chart_series", {})
+    resource_timeline = report.get("resource_timeline", [])
 
-    cpu_series = chart_series.get("cpu_percent", [])
+    cpu_series = build_scalar_timeline_series(resource_timeline, "cpu_percent")
     cpu_x, cpu_y = build_elapsed_series(cpu_series)
 
-    memory_series = chart_series.get(MEMORY_SERIES_KEY, [])
+    memory_series = build_scalar_timeline_series(resource_timeline, MEMORY_SERIES_KEY)
     memory_x, memory_y = build_elapsed_series(memory_series)
 
-    gpu_util_series = chart_series.get("gpu_util_percent", {})
+    gpu_util_series = build_gpu_timeline_series(resource_timeline, "gpu_util_percent")
     gpu_util_x: list[float] = []
     gpu_util_y: dict[str, list[float]] = {}
     for gpu_payload in gpu_util_series.values():
         gpu_util_x, values = build_elapsed_series(gpu_payload.get("values", []))
         gpu_util_y[gpu_series_label(gpu_payload)] = values
-    gpu_memory_series = chart_series.get("gpu_memory_used_mb", {})
+    gpu_memory_series = build_gpu_timeline_series(resource_timeline, "gpu_memory_used_mb")
     gpu_memory_x: list[float] = []
     gpu_memory_y: dict[str, list[float]] = {}
     for gpu_payload in gpu_memory_series.values():
diff --git a/trinity/perf/stage_perf.py b/trinity/perf/stage_perf.py
index f4b3382ba41..7185c26ebaa 100644
--- a/trinity/perf/stage_perf.py
+++ b/trinity/perf/stage_perf.py
@@ -13,7 +13,6 @@
 
 from trinity.buffer.pipelines.task_pipeline import check_and_run_task_pipeline
 from trinity.common.config import Config, load_config
-from trinity.perf.report_utils import build_resource_timeline_payload
 from trinity.perf.resource_sampler import ResourceSampler
 from trinity.perf.tensorboard_metrics import (
     TensorBoardScalarReader,
@@ -26,7 +25,7 @@
 class ExplorerPerfOptions:
     config_path: str
     output_path: str
-    monitor_interval: float = 5.0
+    monitor_interval: float = 2.0
     total_steps: int = 5
     timeout: Optional[float] = None
 
@@ -39,11 +38,11 @@ def validate_explorer_perf_config(config: Config) -> None:
 
 def build_explorer_perf_payload(
     *,
-    config: Config,
+    config: Optional[Config],
     options: ExplorerPerfOptions,
-    startup_time_sec: float,
-    execution_time_sec: float,
-    total_time_sec: float,
+    startup_time_sec: Optional[float],
+    execution_time_sec: Optional[float],
+    total_time_sec: Optional[float],
     resource_payload: dict[str, Any],
     step_metrics: list[dict[str, Any]],
     success: bool,
@@ -102,13 +101,13 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
     from trinity.cli.launcher import explore
 
     load_plugins()
-    config: Config = None
+    config: Optional[Config] = None
     sampler: Optional[ResourceSampler] = None
     error: Optional[str] = None
-    startup_time_sec: float = None
-    execution_time_sec: float = None
-    total_time_sec: float = None
-    resource_payload: dict[str, Any] = {"resource_timeline": [], "chart_series": {}}
+    startup_time_sec: Optional[float] = None
+    execution_time_sec: Optional[float] = None
+    total_time_sec: Optional[float] = None
+    resource_payload: dict[str, Any] = {"resource_timeline": []}
     step_metrics: list[dict[str, Any]] = []
 
     try:
@@ -142,7 +141,7 @@ def run_explorer_perf(options: ExplorerPerfOptions) -> dict[str, Any]:
         raise e
     finally:
         collected_samples = sampler.stop() if sampler is not None else []
-        resource_payload = build_resource_timeline_payload(collected_samples)
+        resource_payload = {"resource_timeline": [sample.to_dict() for sample in collected_samples]}
 
         if config is not None:
             tensorboard_dir = os.path.join(