diff --git a/examples/pi0/README.md b/examples/pi0/README.md
index 13f02422dd..ffcdcba9d2 100644
--- a/examples/pi0/README.md
+++ b/examples/pi0/README.md
@@ -157,7 +157,7 @@ Configure the following fields:
 - `system.scheduler.decay_lr` - Final learning rate after decay (default: `2.5e-6`)
 - `system.checkpoint.save_checkpoint` - Whether to save checkpoints (default: `true`)
 - `system.checkpoint.save_freq` - Steps between checkpoints (default: `1000`)
-- `system.checkpoint.output_directory` - Checkpoint output directory (default: `${experiment.exp_dir}/ckpt`)
+- `system.checkpoint.output_directory` - Base output directory; checkpoints are written to its `checkpoints/` subdirectory (default: `${experiment.exp_dir}`)
 
 **Model settings**:
 - `model.model_name` - Model name: `"pi0"` or `"pi0.5"`
@@ -186,7 +186,7 @@ python run.py --config-path ./examples/pi0/conf --config-name train action=run
 
 Training logs are saved to `outputs/pi0_train/logs/host_0_localhost.output` by default.
 
-Checkpoints are saved to `${experiment.exp_dir}/ckpt` (default: `outputs/pi0_train/ckpt`).
+Checkpoints are saved to `${experiment.exp_dir}/checkpoints` (default: `outputs/pi0_train/checkpoints`).
 
 ### Stop Training
 ```sh
diff --git a/examples/pi0/conf/train/pi0.yaml b/examples/pi0/conf/train/pi0.yaml
index 88448f489c..75ad610dd9 100644
--- a/examples/pi0/conf/train/pi0.yaml
+++ b/examples/pi0/conf/train/pi0.yaml
@@ -20,7 +20,7 @@ system:
     decay_lr: 2.5e-6
 
   checkpoint:
-    output_directory: ${experiment.exp_dir}/ckpt
+    output_directory: ${experiment.exp_dir}
     # Whether to save checkpoint
     save_checkpoint: true
     # Number of steps between checkpoints
diff --git a/examples/pi0_5/README.md b/examples/pi0_5/README.md
index e345d25e45..4c177e5ed1 100644
--- a/examples/pi0_5/README.md
+++ b/examples/pi0_5/README.md
@@ -164,7 +164,7 @@ Configure the following fields:
 - `system.scheduler.decay_lr` - Final learning rate after decay (default: `2.5e-6`)
 - `system.checkpoint.save_checkpoint` - Whether to save checkpoints (default: `true`)
 - `system.checkpoint.save_freq` - Steps between checkpoints (default: `1000`)
-- `system.checkpoint.output_directory` - Checkpoint output directory (default: `${experiment.exp_dir}/ckpt`)
+- `system.checkpoint.output_directory` - Base output directory; checkpoints are written to its `checkpoints/` subdirectory (default: `${experiment.exp_dir}`)
 
 **Model settings**:
 - `model.model_name` - Model name: `"pi0.5"`
@@ -193,7 +193,7 @@ python run.py --config-path ./examples/pi0_5/conf --config-name train action=run
 
 Training logs are saved to `outputs/pi0_5_train/logs/host_0_localhost.output` by default.
 
-Checkpoints are saved to `${experiment.exp_dir}/ckpt` (default: `outputs/pi0_5_train/ckpt`).
+Checkpoints are saved to `${experiment.exp_dir}/checkpoints` (default: `outputs/pi0_5_train/checkpoints`).
 
 ### Stop Training
 ```sh
diff --git a/examples/pi0_5/conf/train/pi0_5.yaml b/examples/pi0_5/conf/train/pi0_5.yaml
index 2027c1a6cb..4599321e6b 100644
--- a/examples/pi0_5/conf/train/pi0_5.yaml
+++ b/examples/pi0_5/conf/train/pi0_5.yaml
@@ -20,7 +20,7 @@ system:
     decay_lr: 2.5e-6
 
   checkpoint:
-    output_directory: ${experiment.exp_dir}/ckpt
+    output_directory: ${experiment.exp_dir}
     # Whether to save checkpoint
     save_checkpoint: true
     # Number of steps between checkpoints
diff --git a/examples/qwen_gr00t/README.md b/examples/qwen_gr00t/README.md
new file mode 100644
index 0000000000..13f02422dd
--- /dev/null
+++ b/examples/qwen_gr00t/README.md
@@ -0,0 +1,343 @@
+# PI0: Training, Inference, and Serving
+
+This guide covers how to train, run inference, and serve PI0 models using FlagScale.
+
+## Installation
+
+### Clone Repository
+
+```sh
+git clone https://github.com/FlagOpen/FlagScale.git
+cd FlagScale/
+```
+
+### Setup Conda Environment
+
+Create a new conda environment for robotics training:
+
+```sh
+conda create -n flagos-robo python=3.12
+conda activate flagos-robo
+```
+
+Install FlagScale and robotics dependencies:
+
+```sh
+cd FlagScale/
+pip install . --verbose
+pip install -r requirements/train/robotics/requirements.txt
+```
+
+Install additional dependencies for downloading models/datasets:
+
+```sh
+# For HuggingFace Hub
+pip install huggingface_hub
+
+# For ModelScope (optional)
+pip install modelscope
+```
+
+## Download Models and Tokenizers
+
+Download models and tokenizers using the provided script. Choose either HuggingFace Hub or ModelScope based on your preference:
+
+**Using HuggingFace Hub:**
+
+```sh
+cd FlagScale/
+python examples/pi0/download.py \
+    --repo_id lerobot/pi0_base \
+    --output_dir /workspace/models \
+    --source huggingface
+
+python examples/pi0/download.py \
+    --repo_id google/paligemma-3b-pt-224 \
+    --output_dir /workspace/models \
+    --source huggingface
+```
+
+**Using ModelScope:**
+
+```sh
+cd FlagScale/
+python examples/pi0/download.py \
+    --repo_id lerobot/pi0_base \
+    --output_dir /workspace/models \
+    --source modelscope
+
+python examples/pi0/download.py \
+    --repo_id google/paligemma-3b-pt-224 \
+    --output_dir /workspace/models \
+    --source modelscope
+```
+
+The models will be downloaded to (example with `/workspace/models`):
+- `/workspace/models/lerobot/pi0_base`
+- `/workspace/models/google/paligemma-3b-pt-224`
+
+
+## Training
+
+### Prepare Dataset
+
+FlagScale uses the **LeRobotDataset v3.0** format. For detailed information about the format structure, see the [LeRobotDataset v3.0 documentation](https://huggingface.co/docs/lerobot/en/lerobot-dataset-v3).
+
+For example, to download the `aloha_mobile_cabinet` dataset:
+
+**Using HuggingFace Hub:**
+
+```sh
+cd FlagScale/
+python examples/pi0/download.py \
+    --repo_id lerobot/aloha_mobile_cabinet \
+    --output_dir /workspace/datasets \
+    --repo_type dataset \
+    --source huggingface
+```
+
+**Using ModelScope:**
+
+```sh
+cd FlagScale/
+python examples/pi0/download.py \
+    --repo_id lerobot/aloha_mobile_cabinet \
+    --output_dir /workspace/datasets \
+    --repo_type dataset \
+    --source modelscope
+```
+
+The dataset will be downloaded to (example with `/workspace/datasets`):
+- `/workspace/datasets/lerobot/aloha_mobile_cabinet`
+
+### Edit Config
+
+FlagScale uses a two-level configuration system:
+
+1. **Experiment-level config** (`examples/pi0/conf/train.yaml`): Defines experiment settings, environment variables, and resource allocation
+2. **Task-level config** (`examples/pi0/conf/train/pi0.yaml`): Defines model, dataset, and training hyperparameters
+
+#### Experiment-Level Config
+
+Edit the experiment-level config for multi-GPU training:
+
+```sh
+cd FlagScale/
+vim examples/pi0/conf/train.yaml
+```
+
+Configure the following fields:
+
+- `experiment.envs.CUDA_VISIBLE_DEVICES` - GPU devices to use (e.g., `"0,1,2,3"` for 4 GPUs, `"0,1"` for 2 GPUs)
+- `experiment.envs.CUDA_DEVICE_MAX_CONNECTIONS` - Connection limit (typically `1`)
+- `experiment.exp_name` - Experiment name
+- `experiment.exp_dir` - Output directory for checkpoints and logs
+
+#### Task-Level Config
+
+Edit the task-level config for model and training settings:
+
+```sh
+cd FlagScale/
+vim examples/pi0/conf/train/pi0.yaml
+```
+
+Configure the following fields:
+
+**System settings** (training hyperparameters):
+- `system.batch_size` - Batch size per GPU
+- `system.train_steps` - Total training steps
+- `system.optimizer.name` - Optimizer name (default: `"AdamW"`)
+- `system.optimizer.lr` - Learning rate (default: `2.5e-5`)
+- `system.optimizer.betas` - Optimizer betas (default: `[0.9, 0.95]`)
+- `system.optimizer.eps` - Optimizer epsilon (default: `1.0e-8`)
+- `system.optimizer.weight_decay` - Weight decay (default: `0.01`)
+- `system.scheduler.warmup_steps` - Warmup steps (default: `1000`)
+- `system.scheduler.decay_steps` - Decay steps (default: `30000`)
+- `system.scheduler.decay_lr` - Final learning rate after decay (default: `2.5e-6`)
+- `system.checkpoint.save_checkpoint` - Whether to save checkpoints (default: `true`)
+- `system.checkpoint.save_freq` - Steps between checkpoints (default: `1000`)
+- `system.checkpoint.output_directory` - Checkpoint output directory (default: `${experiment.exp_dir}`)
+
+**Model settings**:
+- `model.model_name` - Model name: `"pi0"` or `"pi0.5"`
+- `model.checkpoint_dir` - Path to pretrained model (e.g., `/workspace/models/lerobot/pi0_base`)
+- `model.tokenizer_path` - Path to tokenizer (e.g., `/workspace/models/google/paligemma-3b-pt-224`)
+- `model.tokenizer_max_length` - Maximum tokenizer sequence length
+- `model.action_steps` - Number of action steps to predict
+
+**Data settings**:
+- `data.data_path` - Path to LeRobot dataset root (e.g., `/workspace/datasets/lerobot/aloha_mobile_cabinet`)
+- `data.use_imagenet_stats` - Whether to use ImageNet normalization stats (default: `true`)
+- `data.rename_map` - Dictionary mapping dataset keys to policy keys (optional). Check the `features` key in your dataset's `meta/info.json` file to determine the correct mapping:
+  ```yaml
+  rename_map:
+    observation.images.cam_high: observation.images.base_0_rgb
+    observation.images.cam_left_wrist: observation.images.left_wrist_0_rgb
+    observation.images.cam_right_wrist: observation.images.right_wrist_0_rgb
+  ```
+- `data.use_quantiles` - Whether to use quantile normalization (for `pi0.5`, set to `false` to use MEAN_STD normalization)
+
+### Start Training
+```sh
+cd FlagScale/
+python run.py --config-path ./examples/pi0/conf --config-name train action=run
+```
+
+Training logs are saved to `outputs/pi0_train/logs/host_0_localhost.output` by default.
+
+Checkpoints are saved to `${experiment.exp_dir}/checkpoints` (default: `outputs/pi0_train/checkpoints`).
+
+### Stop Training
+```sh
+cd FlagScale/
+python run.py --config-path ./examples/pi0/conf --config-name train action=stop
+```
+
+## Inference
+
+### Prepare Inference Inputs
+
+You can extract inference inputs (images, state, task) from a dataset using the provided script:
+
+```sh
+cd FlagScale/
+python examples/pi0/dump_dataset_inputs.py \
+    --dataset_root /workspace/datasets/lerobot/aloha_mobile_cabinet \
+    --output_dir ./inference_inputs \
+    --frame_index 100
+```
+
+This will create:
+- `frame_100_observation_images_*.jpg` - Image files
+- `frame_100_state.pt` - State tensor
+- `frame_100_task.txt` - Task prompt
+- `extraction_summary.json` - Summary of extracted files
+
+Alternatively, you can extract from a specific episode and frame:
+
+```sh
+python examples/pi0/dump_dataset_inputs.py \
+    --dataset_root /workspace/datasets/lerobot/aloha_mobile_cabinet \
+    --output_dir ./inference_inputs \
+    --episode_index 0 \
+    --frame_in_episode 50
+```
+
+Or extract multiple samples at once:
+
+```sh
+python examples/pi0/dump_dataset_inputs.py \
+    --dataset_root /workspace/datasets/lerobot/aloha_mobile_cabinet \
+    --output_dir ./inference_inputs \
+    --frame_indices 100 200 300
+```
+
+### Edit Config
+
+```sh
+cd FlagScale/
+vim examples/pi0/conf/inference/pi0.yaml
+```
+
+Configure the following fields:
+
+**Engine settings:**
+- `engine.model` - Path to pretrained model (e.g., `/workspace/models/lerobot/pi0_base`)
+- `engine.tokenizer` - Path to tokenizer (e.g., `/workspace/models/google/paligemma-3b-pt-224`)
+- `engine.stat_path` - Path to dataset statistics (e.g., `/workspace/datasets/lerobot/aloha_mobile_cabinet/meta/stats.json`)
+- `engine.device` - Device to use (e.g., `"cuda"`)
+
+**Generate settings:**
+- `generate.images` - Dictionary mapping image keys to file paths:
+  ```yaml
+  images:
+    observation.images.cam_high: /path/to/image1.jpg
+    observation.images.cam_left_wrist: /path/to/image2.jpg
+    observation.images.cam_right_wrist: /path/to/image3.jpg
+  ```
+- `generate.state_path` - Path to state tensor file (`.pt` file)
+- `generate.task_path` - Path to task prompt file (`.txt` file)
+- `generate.rename_map` (optional) - Map input keys to policy expected keys:
+  ```yaml
+  rename_map:
+    observation.images.cam_high: observation.images.base_0_rgb
+    observation.images.cam_left_wrist: observation.images.left_wrist_0_rgb
+    observation.images.cam_right_wrist: observation.images.right_wrist_0_rgb
+  ```
+
+### Run Inference
+
+```sh
+cd FlagScale/
+python run.py \
+    --config-path ./examples/pi0/conf \
+    --config-name inference \
+    action=run
+```
+
+Inference logs are saved to `outputs/pi0_inference/inference_logs/host_0_localhost.output` by default.
+
+The predicted action tensor is printed to the console and saved in the log file.
+
+## Serving
+
+### Edit Config
+
+```sh
+cd FlagScale/
+vim examples/pi0/conf/serve/pi0.yaml
+```
+
+Configure the following fields:
+
+**Engine arguments:**
+- `engine_args.host` - Server host (default: `"0.0.0.0"`)
+- `engine_args.port` - Server port (default: `5000`)
+- `engine_args.model` - Path to pretrained model (e.g., `/workspace/models/lerobot/pi0_base`)
+- `engine_args.tokenizer` - Path to tokenizer (e.g., `/workspace/models/google/paligemma-3b-pt-224`)
+- `engine_args.stat_path` - Path to dataset statistics (e.g., `/workspace/datasets/lerobot/aloha_mobile_cabinet/meta/stats.json`)
+- `engine_args.device` - Device to use (e.g., `"cuda"`)
+- `engine_args.images_keys` - List of image keys expected by the model (do not change):
+  ```yaml
+  images_keys:
+    - observation.images.base_0_rgb
+    - observation.images.left_wrist_0_rgb
+    - observation.images.right_wrist_0_rgb
+  ```
+- `engine_args.images_shape` - Image shape `[C, H, W]` for warmup (e.g., `[3, 480, 640]`)
+- `engine_args.state_key` - Key for state in the batch (e.g., `"observation.state"`)
+
+### Run Serving
+
+```sh
+cd FlagScale/
+python run.py --config-path ./examples/pi0/conf --config-name serve action=run
+```
+
+Serving logs are saved to `outputs/pi0_serve/logs/host_0_localhost.output` by default.
+
+### Stop Serving
+
+```sh
+cd FlagScale/
+python run.py --config-path ./examples/pi0/conf --config-name serve action=stop
+```
+
+### Test Server with Client
+
+The client should send images using keys that match the `images_keys` in the config. For example, if using the default config:
+
+```sh
+cd FlagScale/
+python examples/pi0/client_pi0.py \
+  --host 127.0.0.1 \
+  --port 5000 \
+  --img1 ./inference_inputs/frame_100_observation_images_cam_high.jpg \
+  --img2 ./inference_inputs/frame_100_observation_images_cam_left_wrist.jpg \
+  --img3 ./inference_inputs/frame_100_observation_images_cam_right_wrist.jpg \
+  --state-path ./inference_inputs/frame_100_state.pt \
+  --instruction "Grab the orange and put it into the basket."
+```
+
+**Note**: The client must send image keys that match the `engine_args.images_keys` in the config.
diff --git a/examples/qwen_gr00t/client_pi0.py b/examples/qwen_gr00t/client_pi0.py
new file mode 100644
index 0000000000..4074ad3839
--- /dev/null
+++ b/examples/qwen_gr00t/client_pi0.py
@@ -0,0 +1,129 @@
+import argparse
+import base64
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import requests
+import torch
+
+
+def encode_image(path: str) -> str:
+    """Return the raw bytes of the file at ``path`` encoded as a base64 (UTF-8) string."""
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"Image not found: {path.resolve()}")  # report the absolute path
+    return base64.b64encode(path.read_bytes()).decode("utf-8")
+
+
+def check_health(base_url: str) -> None:
+    """GET ``{base_url}/health``; raise RuntimeError on request failure or a not-ready server."""
+    try:
+        r = requests.get(f"{base_url}/health", timeout=5)
+        r.raise_for_status()  # HTTP error statuses are treated as a failed health check
+    except Exception as e:
+        raise RuntimeError(f"Health-check request failed: {e}") from e
+
+    data = r.json()  # NOTE(review): outside the try, so a non-JSON body is not wrapped
+    if not (data.get("status") == "healthy" and data.get("model_loaded")):
+        raise RuntimeError(f"Server not ready: {json.dumps(data, indent=2)}")
+    print(f"[√] Server healthy - GPU: {data['gpu_info']['device_name']}")  # assumes gpu_info exists
+
+
+def load_state_from_file(state_path: str) -> np.ndarray:
+    """Load a saved state with ``torch.load`` (mapped to CPU) and return it as a numpy array.
+
+    Args:
+        state_path: Path to the state file (.pt file) passed to ``torch.load``
+
+    Returns:
+        The state array; 1-D input becomes (1, state_dim), other ranks are returned unchanged
+    """
+    state = torch.load(state_path, map_location="cpu")  # NOTE(review): load trusted files only
+    if isinstance(state, torch.Tensor):
+        state = state.numpy()
+    # Add a leading batch axis to 1-D states so the shape is (1, state_dim)
+    if state.ndim == 1:
+        state = state[np.newaxis, :]
+    return state
+
+
+def build_payload(args) -> dict[str, Any]:
+    """Build the request body for /infer from the parsed CLI arguments.
+
+    Reads ``args.img1``/``img2``/``img3``, ``args.state_path`` and ``args.instruction``.
+    Image keys are hard-coded and must match the server config's images_keys:
+    - observation.images.base_0_rgb
+    - observation.images.left_wrist_0_rgb
+    - observation.images.right_wrist_0_rgb
+    """
+    # Base64-encode each image under its fixed key (img1/img2/img3 -> base/left/right)
+    img_sample = {
+        "observation.images.base_0_rgb": encode_image(args.img1),
+        "observation.images.left_wrist_0_rgb": encode_image(args.img2),
+        "observation.images.right_wrist_0_rgb": encode_image(args.img3),
+    }
+    # Load the state and convert it to nested lists so the payload is JSON-serializable
+    state = load_state_from_file(args.state_path)
+    state = state.tolist()
+
+    return {"instruction": args.instruction, "state": state, "images": [img_sample]}
+
+
+def pretty_print_resp(resp: requests.Response) -> None:
+    """Print the response body as indented JSON, falling back to the raw text otherwise."""
+    try:
+        print(json.dumps(resp.json(), indent=2, ensure_ascii=False))
+    except ValueError:  # body could not be handled as JSON; show it verbatim instead
+        print(resp.text)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Client for RoboBrain-Robotics inference API")
+    parser.add_argument(
+        "--host", default="127.0.0.1", help="Host of local SSH tunnel (default: 127.0.0.1)"
+    )
+    parser.add_argument(
+        "--port", type=int, default=5000, help="Port of local SSH tunnel (default: 15000)"
+    )
+    parser.add_argument("--img1", required=True, help="Path to first camera RGB image")
+    parser.add_argument("--img2", required=True, help="Path to second camera RGB image")
+    parser.add_argument("--img3", required=True, help="Path to third camera RGB image")
+    parser.add_argument(
+        "--state-path",
+        required=True,
+        help="Path to state tensor file (.pt file) with shape (1, state_dim)",
+    )
+    parser.add_argument(
+        "--instruction",
+        default="Grab the orange and put it into the basket.",
+        help="Task instruction for the robot",
+    )
+    args = parser.parse_args()
+
+    base_url = f"http://{args.host}:{args.port}"
+    print(f"-> Using endpoint: {base_url}")
+
+    payload = build_payload(args)
+    try:
+        t0 = time.time()
+        resp = requests.post(
+            f"{base_url}/infer",
+            headers={"Content-Type": "application/json"},
+            data=json.dumps(payload),
+            timeout=300,
+        )
+        elapsed = (time.time() - t0) * 1000
+        resp.raise_for_status()
+    except requests.RequestException as e:
+        print(f"[Error] HTTP request failed: {e}")
+        sys.exit(1)
+    print(f"[√] Response OK ({resp.status_code})  -  {elapsed:.1f}ms")
+    pretty_print_resp(resp)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/qwen_gr00t/conf/inference.yaml b/examples/qwen_gr00t/conf/inference.yaml
new file mode 100644
index 0000000000..36d3686823
--- /dev/null
+++ b/examples/qwen_gr00t/conf/inference.yaml
@@ -0,0 +1,26 @@
+defaults:
+  - _self_
+  - inference: qwen_gr00t
+
+experiment:
+  exp_name: qwen_gr00t_inference
+  exp_dir: outputs/${experiment.exp_name}
+  model: /models/qwen_gr00t
+  task:
+    type: inference
+    backend: vllm # TODO: Remove this restriction
+    entrypoint: flagscale/inference/inference_qwen_gr00t.py
+  runner:
+    hostfile: null
+  cmds:
+    before_start: null
+  envs:
+    CUDA_VISIBLE_DEVICES: 2
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    # Optionally, set HF_HOME and HF_ENDPOINT
+
+action: run
+
+hydra:
+  run:
+    dir: ${experiment.exp_dir}/hydra
diff --git a/examples/qwen_gr00t/conf/inference/qwen_gr00t.yaml b/examples/qwen_gr00t/conf/inference/qwen_gr00t.yaml
new file mode 100644
index 0000000000..43890a76f6
--- /dev/null
+++ b/examples/qwen_gr00t/conf/inference/qwen_gr00t.yaml
@@ -0,0 +1,11 @@
+engine:
+  model_variant: "QwenGr00t"
+  model: /share/project/fengyupu/github/FlagScale_2/outputs/qwen_gr00t_train/20260207_110505.701567_ckpt/last
+  device: "cuda"
+
+generate:
+  images:
+    observation.images.wrist_image: qwen_gr00t_inference_inputs/frame_100_observation_images_wrist_image.jpg
+    observation.images.image: qwen_gr00t_inference_inputs/frame_100_observation_images_image.jpg
+  state_path: qwen_gr00t_inference_inputs/frame_100_state.pt
+  task_path: qwen_gr00t_inference_inputs/frame_100_task.txt
diff --git a/examples/qwen_gr00t/conf/serve.yaml b/examples/qwen_gr00t/conf/serve.yaml
new file mode 100644
index 0000000000..d8f02ef771
--- /dev/null
+++ b/examples/qwen_gr00t/conf/serve.yaml
@@ -0,0 +1,23 @@
+defaults:
+- _self_
+- serve: qwen_gr00t
+
+experiment:
+  exp_name: qwen_gr00t_serve_2
+  exp_dir: outputs/${experiment.exp_name}
+  task:
+    type: serve
+    entrypoint: flagscale/serve/run_serve_qwen_gr00t.py
+  runner:
+    hostfile: null
+    deploy:
+      use_fs_serve: false
+  envs:
+    CUDA_VISIBLE_DEVICES: 3
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+action: run
+
+hydra:
+  run:
+    dir: ${experiment.exp_dir}/hydra
diff --git a/examples/qwen_gr00t/conf/serve/qwen_gr00t.yaml b/examples/qwen_gr00t/conf/serve/qwen_gr00t.yaml
new file mode 100644
index 0000000000..2f2dd73173
--- /dev/null
+++ b/examples/qwen_gr00t/conf/serve/qwen_gr00t.yaml
@@ -0,0 +1,14 @@
+- serve_id: vllm_model # Not in use
+  engine_args:
+    host: 0.0.0.0
+    port: 6000
+    model_variant: QwenGr00t
+    model: /share/project/fengyupu/github/FlagScale_2/outputs/qwen_gr00t_train/20260208_112711.741406_ckpt/last
+    device: "cuda"
+    images_keys:
+      - observation.images.base_0_rgb
+      - observation.images.left_wrist_0_rgb
+      - observation.images.right_wrist_0_rgb
+    # Only used for warmup
+    images_shape: [3, 480, 640]
+    state_key: observation.state
diff --git a/examples/qwen_gr00t/conf/train.yaml b/examples/qwen_gr00t/conf/train.yaml
new file mode 100644
index 0000000000..2f537a9a58
--- /dev/null
+++ b/examples/qwen_gr00t/conf/train.yaml
@@ -0,0 +1,35 @@
+defaults:
+  - _self_
+  - train: qwen_gr00t
+
+experiment:
+  exp_name: qwen_gr00t_train
+  seed: 42
+  save_steps: 10000
+  load: null
+  exp_dir: outputs/${experiment.exp_name}
+  ckpt_format: torch
+  task:
+    type: train
+    backend: native
+    entrypoint: flagscale/train/train_qwen_gr00t.py
+  runner:
+    per_node_task: false
+    no_shared_fs: false
+    rdzv_backend: static
+    hostfile: null
+  cmds:
+    before_start: echo "Starting Qwen-GR00T Training"
+  envs:
+    LOGLEVEL: "INFO"
+    # CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_VISIBLE_DEVICES: "2"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    WANDB_MODE: offline
+    OTEL_SDK_DISABLED: true
+
+action: run
+
+hydra:
+  run:
+    dir: ${experiment.exp_dir}/hydra
diff --git a/examples/qwen_gr00t/conf/train/qwen_gr00t.yaml b/examples/qwen_gr00t/conf/train/qwen_gr00t.yaml
new file mode 100644
index 0000000000..952885d8d2
--- /dev/null
+++ b/examples/qwen_gr00t/conf/train/qwen_gr00t.yaml
@@ -0,0 +1,168 @@
+system:
+  batch_size: 16
+  train_steps: 30000
+  log_freq: 1
+  grad_clip_norm: 1.0
+  use_amp: true
+  shuffle: true
+  num_workers: 4
+
+  optimizer:
+    name: AdamW
+    lr: 2.5e-5
+    betas: [0.9, 0.95]
+    eps: 1.0e-08
+    weight_decay: 1.0e-08
+    param_groups:
+      vlm:
+        lr: 1.0e-05
+      action_model:
+        lr: 1.0e-04
+
+  scheduler:
+    name: cosine_with_min_lr
+    warmup_steps: 5000
+    scheduler_kwargs:
+      min_lr: 1.0e-06
+    # Legacy fields kept for BC
+    decay_steps: 30000
+    decay_lr: 2.5e-6
+
+  checkpoint:
+    output_directory: ${experiment.exp_dir}
+    # Whether to save checkpoint
+    save_checkpoint: true
+    # Number of steps between checkpoints
+    save_freq: 1000
+  # TODO(yupu): Support resuming from checkpoint
+
+model:
+  # TODO: (yupu) the config layout is still a mess
+  model_name: qwen_gr00t
+  # Path to the checkpoint of the pretrained base VLM model, e.g. Qwen3-VL-4B-Instruct
+  checkpoint_dir: /share/project/fengyupu/models/Qwen/Qwen3-VL-4B-Instruct/
+  # checkpoint_dir: /workspace/models/Qwen/Qwen2.5-VL-3B-Instruct/
+  vlm:
+    type: qwen3-vl
+    # type: qwen2.5-vl
+  qwenvl:
+    base_vlm: /share/project/fengyupu/models/Qwen/Qwen3-VL-4B-Instruct/
+    # base_vlm: /workspace/models/Qwen/Qwen2.5-VL-3B-Instruct/
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048
+  dino:
+    dino_backbone: dinov2_vits14
+  action_model:
+    type: flow_matching
+    action_model_type: DiT-B
+    action_hidden_dim: 1024
+    hidden_size: 1024
+    add_pos_embed: True
+    max_seq_len: 1024
+    action_dim: 7
+    state_dim: 7
+    future_action_window_size: 7
+    action_horizon: 8
+    past_action_window_size: 0
+    repeated_diffusion_steps: 4
+    noise_beta_alpha: 1.5
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048
+      dropout: 0.2
+      final_dropout: True
+      # # FIXME: Debug only
+      # dropout: 0
+      # final_dropout: False
+      interleave_self_attention: True
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024
+      positional_embeddings: None
+  reduce_in_full_precision: True
+
+  # ============================================================
+  # Module Freezing Configuration
+  # ============================================================
+  # Freezing logic: freeze_patterns are applied first, then keep_patterns override.
+  # Patterns are regex matched against full parameter names.
+  #
+  # Common patterns for QwenGR00T:
+  #   - "qwen_vl_interface\\..*"                              # Entire VLM
+  #   - "qwen_vl_interface\\.model\\.visual\\..*"             # Vision encoder
+  #   - "qwen_vl_interface\\.model\\.model\\..*"              # Language model
+  #   - "qwen_vl_interface\\.model\\.model\\.layers\\.[0-9]\\."  # LLM layers 0-9
+  #   - "action_model\\..*"                                   # Action head
+  #   - "action_model\\.model\\.transformer_blocks\\.[0-7]\\."   # DiT blocks 0-7
+  #
+  # freeze:
+  #   # SCENARIO A: Freeze VLM, train only action head
+    # freeze_patterns:
+    #   - "qwen_vl_interface\\..*"
+  #
+  #   # SCENARIO B: Freeze VLM but keep projector trainable
+  #   # freeze_patterns:
+  #   #   - "qwen_vl_interface\\..*"
+  #   # keep_patterns:
+  #   #   - "qwen_vl_interface\\.model\\.visual\\.merger\\..*"
+  #
+  #   # SCENARIO C: Freeze everything except action decoder
+  #   # freeze_patterns:
+  #   #   - ".*"
+  #   # keep_patterns:
+  #   #   - "action_model\\.action_decoder\\..*"
+
+data:
+  # TODO: (yupu) Remove this once we have a proper dataset config
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: playground/Datasets/
+    data_mix: libero_goal_old
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. To identify the key objects for your task. Locate their bounding boxes in [x1,y1,x2,y2] format.
+    CoT_answer: bbox
+    default_image_resolution: [3, 224, 224]
+    load_all_data_for_training: True
+    obs: ["image_0"]
+    video_backend: torchvision_av
+  # Path to the training data
+  data_path: /share/project/fengyupu/datasets/IPEC-COMMUNITY/libero_goal_no_noops_1.0.0_lerobot/
+  tolerance_s: 0.0001
+  use_imagenet_stats: False
+  # To match the input features naming from the dataset to the policy config
+  # For example, for the aloha_mobile_cabinet dataset, the rename_map is:
+  rename_map:
+    observation.images.cam_high: observation.images.base_0_rgb
+    observation.images.cam_left_wrist: observation.images.left_wrist_0_rgb
+    observation.images.cam_right_wrist: observation.images.right_wrist_0_rgb
+  use_quantiles: false
+  # TODO: (yupu) I think these indices should belong to the policy config, maybe put it in the model config?
+  observation_delta_indices: [0]
+  action_delta_indices: [0,1,2,3,4,5,6,7]
+  preprocessor:
+    name: policy_preprocessor
+    steps:
+      - registry_name: rename_observations_processor
+        config:
+          rename_map: {}
+      - registry_name: to_batch_processor
+        config: {}
+      - registry_name: device_processor
+        config:
+          device: cuda
+          float_dtype: null
+      - registry_name: normalizer_processor
+        config:
+          eps: 1e-8
+          features: {}
+          # Only normalize first 6 action dims (x,y,z,roll,pitch,yaw).
+          # Gripper (dim 6) is left raw, matching starVLA's Libero4in1DataConfig.
+          normalize_action_dims: 6
+          norm_map:
+            VISUAL: IDENTITY
+            STATE: MIN_MAX
+            ACTION: MIN_MAX
diff --git a/examples/qwen_gr00t/download.py b/examples/qwen_gr00t/download.py
new file mode 100755
index 0000000000..48391c1e72
--- /dev/null
+++ b/examples/qwen_gr00t/download.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+Download models or datasets from HuggingFace Hub or ModelScope to a user-defined folder.
+
+Usage:
+    # Download model from HuggingFace
+    python download.py \
+        --repo_id lerobot/pi0_base \
+        --output_dir ~/models \
+        --source huggingface
+    # Downloads to: ~/models/lerobot/pi0_base
+
+    # Download dataset from HuggingFace
+    python download.py \
+        --repo_id lerobot/aloha_mobile_cabinet \
+        --output_dir ~/datasets \
+        --repo_type dataset \
+        --source huggingface
+    # Downloads to: ~/datasets/lerobot/aloha_mobile_cabinet
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def _prepare_download(repo_id: str, output_dir: Path, repo_type: str, source_name: str) -> Path:
+    """Prepare download directory and print info.
+
+    Returns:
+        Final output directory path
+    """
+    final_output_dir = output_dir / repo_id
+    print(f"Downloading {repo_type} {repo_id} from {source_name}...")
+    print(f"Output directory: {final_output_dir}")
+    final_output_dir.mkdir(parents=True, exist_ok=True)
+    return final_output_dir
+
+
+def _handle_download_error(e: Exception, repo_id: str, source: str) -> None:
+    """Handle download errors with helpful tips."""
+    print(f"✗ Error downloading from {source}: {e}")
+    if "401" in str(e) or "authentication" in str(e).lower():
+        if source == "HuggingFace":
+            print("\nTip: You may need to set a HuggingFace token:")
+            print("  export HF_TOKEN=your_token_here")
+            print("  or run: huggingface-cli login")
+        else:
+            print("\nTip: You may need to set ModelScope credentials:")
+            print("  export MODELSCOPE_API_TOKEN=your_token_here")
+    elif "404" in str(e) or "not found" in str(e).lower():
+        print(f"\nTip: Repository '{repo_id}' not found. Check the repo ID.")
+    sys.exit(1)
+
+
+def download_from_huggingface(
+    repo_id: str,
+    output_dir: Path,
+    repo_type: str = "model",
+    revision: str | None = None,
+    token: str | None = None,
+) -> Path:
+    """Download model or dataset from HuggingFace Hub.
+
+    Args:
+        repo_id: HuggingFace repository ID (e.g., "lerobot/pi0_base")
+        output_dir: Base directory to save the repository
+            (will be saved to output_dir/repo_id)
+        repo_type: Type of repository - "model" or "dataset" (default: "model")
+        revision: Git revision (branch, tag, or commit hash). Defaults to "main"
+        token: HuggingFace token for private repos. If None, uses cached token
+
+    Returns:
+        Path to downloaded repository directory
+    """
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError:
+        print("Error: huggingface_hub is not installed.")
+        print("Install it with: pip install huggingface_hub")
+        sys.exit(1)
+
+    final_output_dir = _prepare_download(repo_id, output_dir, repo_type, "HuggingFace Hub")
+
+    try:
+        downloaded_path = snapshot_download(
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            local_dir=str(final_output_dir),
+            local_dir_use_symlinks=False,
+            token=token,
+        )
+        downloaded_path = Path(downloaded_path)
+        print(f"✓ Successfully downloaded to: {downloaded_path}")
+        return downloaded_path
+    except Exception as e:
+        _handle_download_error(e, repo_id, "HuggingFace")
+        return Path()  # Never reached, but satisfies type checker
+
+
+def download_from_modelscope(
+    repo_id: str, output_dir: Path, repo_type: str = "model", revision: str | None = None
+) -> Path:
+    """Download model or dataset from ModelScope.
+
+    Args:
+        repo_id: ModelScope repository ID (e.g., "lerobot/pi0_base")
+        output_dir: Base directory to save the repository
+            (will be saved to output_dir/repo_id)
+        repo_type: Type of repository - "model" or "dataset" (default: "model")
+        revision: Git revision (branch, tag, or commit hash). Defaults to "master"
+
+    Returns:
+        Path to downloaded repository directory
+    """
+    try:
+        from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
+    except ImportError:
+        try:
+            from modelscope import snapshot_download as ms_snapshot_download
+        except ImportError:
+            print("Error: modelscope is not installed.")
+            print("Install it with: pip install modelscope")
+            sys.exit(1)
+
+    final_output_dir = _prepare_download(repo_id, output_dir, repo_type, "ModelScope")
+
+    try:
+        downloaded_path = ms_snapshot_download(
+            model_id=repo_id,
+            repo_type=repo_type,
+            local_dir=str(final_output_dir),
+            revision=revision,
+        )
+        downloaded_path = Path(downloaded_path)
+        print(f"✓ Successfully downloaded to: {downloaded_path}")
+        return downloaded_path
+    except Exception as e:
+        _handle_download_error(e, repo_id, "ModelScope")
+        return Path()  # Never reached, but satisfies type checker
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download models or datasets from HuggingFace Hub or ModelScope",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Download model from HuggingFace (saves to ~/models/lerobot/pi0_base)
+  python download.py --repo_id lerobot/pi0_base \\
+      --output_dir ~/models --source huggingface
+
+  # Download dataset from HuggingFace (saves to ~/datasets/lerobot/aloha_mobile_cabinet)
+  python download.py --repo_id lerobot/aloha_mobile_cabinet \\
+      --output_dir ~/datasets --repo_type dataset --source huggingface
+
+  # Download from ModelScope (China users, saves to ~/models/lerobot/pi0_base)
+  python download.py --repo_id lerobot/pi0_base \\
+      --output_dir ~/models --source modelscope
+
+  # Download tokenizer (saves to ~/models/google/paligemma-3b-pt-224)
+  python download.py --repo_id google/paligemma-3b-pt-224 \\
+      --output_dir ~/models --source huggingface
+
+Note: For private repositories, set HF_TOKEN environment variable:
+  export HF_TOKEN=your_token_here
+        """,
+    )
+
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        required=True,
+        help="Repository ID (e.g., 'lerobot/pi0_base' or 'lerobot/aloha_mobile_cabinet')",
+    )
+
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        required=True,
+        help=(
+            "Base output directory (repository will be saved to output_dir/repo_id, "
+            "e.g., '~/models' -> '~/models/lerobot/pi0_base')"
+        ),
+    )
+
+    parser.add_argument(
+        "--repo_type",
+        type=str,
+        choices=["model", "dataset"],
+        default="model",
+        help="Type of repository: 'model' or 'dataset' (default: model)",
+    )
+
+    parser.add_argument(
+        "--source",
+        type=str,
+        choices=["huggingface", "modelscope"],
+        default="huggingface",
+        help="Source to download from: 'huggingface' or 'modelscope' (default: huggingface)",
+    )
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir).expanduser().resolve()
+
+    if args.source == "huggingface":
+        downloaded_path = download_from_huggingface(
+            repo_id=args.repo_id,
+            output_dir=output_dir,
+            repo_type=args.repo_type,
+            revision=None,
+            token=None,
+        )
+    elif args.source == "modelscope":
+        downloaded_path = download_from_modelscope(
+            repo_id=args.repo_id, output_dir=output_dir, repo_type=args.repo_type, revision=None
+        )
+    else:
+        raise ValueError(f"Unknown source: {args.source}")
+
+    repo_type_name = "Dataset" if args.repo_type == "dataset" else "Model"
+    print(f"\n{repo_type_name} downloaded successfully to: {downloaded_path}")
+    print("You can now use this path in your config file:")
+    if args.repo_type == "dataset":
+        print(f"  data_path: {downloaded_path}")
+    else:
+        print(f"  checkpoint_dir: {downloaded_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/qwen_gr00t/dump_dataset_inputs.py b/examples/qwen_gr00t/dump_dataset_inputs.py
new file mode 100644
index 0000000000..1cb9c61acb
--- /dev/null
+++ b/examples/qwen_gr00t/dump_dataset_inputs.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+"""
+Extract inference inputs (images, state, task) from a LeRobotDataset.
+
+This script extracts the required inputs from a dataset sample and saves them
+in a format that can be used by the inference script.
+
+Usage:
+    # Extract from a specific frame index
+    python dump_dataset_inputs.py \
+        --dataset_root /path/to/dataset \
+        --output_dir ./inference_inputs \
+        --frame_index 100
+
+    # Extract from a specific episode and frame
+    python dump_dataset_inputs.py \
+        --dataset_root /path/to/dataset \
+        --output_dir ./inference_inputs \
+        --episode_index 0 \
+        --frame_in_episode 50
+
+    # Extract multiple samples
+    python dump_dataset_inputs.py \
+        --dataset_root /path/to/dataset \
+        --output_dir ./inference_inputs \
+        --frame_indices 100 200 300
+"""
+
+import argparse
+import json
+import os
+import sys
+
+# Add FlagScale root to sys.path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from pathlib import Path
+
+import torch
+from PIL import Image
+from torchvision.transforms import ToPILImage
+
+from flagscale.train.datasets.lerobot_dataset import LeRobotDataset
+
+
+def tensor_to_image(tensor: torch.Tensor) -> Image.Image:
+    """Convert tensor to PIL Image.
+
+    Handles different tensor formats:
+    - (C, H, W) - single image
+    - (H, W, C) - single image (channel last)
+    - (B, C, H, W) - batch, takes first
+    - (B, H, W, C) - batch, takes first
+    """
+    # Remove batch dimension if present
+    if tensor.dim() == 4:
+        tensor = tensor[0]
+
+    # Handle channel-first vs channel-last
+    if tensor.dim() == 3:
+        if tensor.shape[0] == 3 or tensor.shape[0] == 1:
+            # (C, H, W) -> (H, W, C)
+            tensor = tensor.permute(1, 2, 0)
+        # Now should be (H, W, C)
+
+    # Clamp values to [0, 1] if needed
+    if tensor.max() > 1.0:
+        tensor = tensor / 255.0
+
+    # Convert to [0, 255] uint8
+    if tensor.dtype != torch.uint8:
+        tensor = (tensor.clamp(0, 1) * 255).byte()
+
+    # Handle grayscale
+    if tensor.shape[2] == 1:
+        tensor = tensor.squeeze(2)
+
+    # Convert to PIL Image
+    to_pil = ToPILImage()
+    if tensor.shape[2] == 3:
+        # RGB
+        img = to_pil(tensor.permute(2, 0, 1))
+    else:
+        # Grayscale
+        img = Image.fromarray(tensor.numpy(), mode="L")
+
+    return img
+
+
+def extract_sample(
+    dataset: LeRobotDataset,
+    frame_index: int | None = None,
+    episode_index: int | None = None,
+    frame_in_episode: int | None = None,
+) -> dict:
+    """Extract a sample from the dataset.
+
+    Args:
+        dataset: LeRobotDataset instance
+        frame_index: Global frame index (takes precedence)
+        episode_index: Episode index (requires frame_in_episode)
+        frame_in_episode: Frame index within episode
+
+    Returns:
+        Dictionary with sample data
+    """
+    if frame_index is not None:
+        idx = frame_index
+    elif episode_index is not None and frame_in_episode is not None:
+        # Find the global index from episode and frame
+        episode_info = dataset.meta.episodes.iloc[episode_index]
+        idx = episode_info["dataset_from_index"] + frame_in_episode
+    else:
+        raise ValueError("Must provide either frame_index or (episode_index, frame_in_episode)")
+
+    if idx >= len(dataset):
+        raise ValueError(f"Index {idx} out of range (dataset has {len(dataset)} frames)")
+
+    sample = dataset[idx]
+    return sample
+
+
+def dump_sample(
+    sample: dict,
+    output_dir: Path,
+    sample_name: str = "sample",
+    image_format: str = "jpg",
+    dataset=None,
+) -> dict:
+    """Save sample data to files.
+
+    Args:
+        sample: Sample dictionary from dataset
+        output_dir: Directory to save files
+        sample_name: Base name for output files
+        image_format: Image format ('jpg' or 'png')
+
+    Returns:
+        Dictionary with paths to saved files
+    """
+    saved_paths = {"images": {}, "state": None, "task": None}
+
+    # TODO: A little bit hacky
+    image_keys = [k for k in sample.keys() if "images" in k]
+    print(f"Found {len(image_keys)} image key(s): {image_keys}")
+
+    for img_key in image_keys:
+        img_tensor = sample[img_key]
+        img = tensor_to_image(img_tensor)
+
+        filename = img_key.replace(".", "_")
+        img_path = output_dir / f"{sample_name}_{filename}.{image_format}"
+
+        img.save(img_path)
+        print(f"Saved image: {img_path}")
+        saved_paths["images"][img_key] = str(img_path)
+
+    # Extract and save state
+    state_keys = [k for k in sample.keys() if "state" in k and "images" not in k]
+    if state_keys:
+        state_key = state_keys[0]  # Use first state key
+        state_tensor = sample[state_key]
+
+        # Ensure it's 2D (batch, dim)
+        if state_tensor.dim() == 1:
+            state_tensor = state_tensor.unsqueeze(0)
+
+        state_path = output_dir / f"{sample_name}_state.pt"
+        torch.save(state_tensor, state_path)
+        print(f"Saved state: {state_path} (shape: {state_tensor.shape})")
+        saved_paths["state"] = str(state_path)
+    else:
+        print("Warning: No state found in sample")
+
+    # Extract and save task
+    if "task" in sample:
+        task = sample["task"]
+        if isinstance(task, torch.Tensor):
+            task = task.item() if task.numel() == 1 else str(task.tolist())
+        elif isinstance(task, list) and len(task) > 0:
+            task = task[0] if isinstance(task[0], str) else str(task[0])
+
+        task_path = output_dir / f"{sample_name}_task.txt"
+        with open(task_path, "w", encoding="utf-8") as f:
+            f.write(str(task))
+        print(f"Saved task: {task_path} (content: '{task}')")
+        saved_paths["task"] = str(task_path)
+    elif "task_index" in sample:
+        # Try to get task from task_index
+        task_idx = sample["task_index"]
+        if isinstance(task_idx, torch.Tensor):
+            task_idx = task_idx.item()
+
+        # Get task from dataset metadata
+        if dataset is not None and hasattr(dataset, "meta") and hasattr(dataset.meta, "tasks"):
+            tasks_df = dataset.meta.tasks
+            if task_idx < len(tasks_df):
+                task = tasks_df.iloc[task_idx]["task"]
+                task_path = output_dir / f"{sample_name}_task.txt"
+                with open(task_path, "w", encoding="utf-8") as f:
+                    f.write(str(task))
+                print(f"Saved task: {task_path} (content: '{task}')")
+                saved_paths["task"] = str(task_path)
+    else:
+        print("Warning: No task found in sample")
+
+    return saved_paths
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Extract inference inputs from LeRobotDataset")
+    parser.add_argument(
+        "--dataset_root", type=str, default=None, help="Local dataset root directory"
+    )
+    parser.add_argument(
+        "--output_dir", type=str, required=True, help="Output directory to save extracted files"
+    )
+    parser.add_argument(
+        "--frame_index", type=int, default=None, help="Global frame index to extract"
+    )
+    parser.add_argument(
+        "--episode_index",
+        type=int,
+        default=None,
+        help="Episode index (requires --frame_in_episode)",
+    )
+    parser.add_argument(
+        "--frame_in_episode",
+        type=int,
+        default=None,
+        help="Frame index within episode (requires --episode_index)",
+    )
+    parser.add_argument(
+        "--frame_indices",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Multiple frame indices to extract",
+    )
+    parser.add_argument(
+        "--image_format",
+        type=str,
+        default="jpg",
+        choices=["jpg", "png"],
+        help="Image format to save",
+    )
+    parser.add_argument(
+        "--video_backend",
+        type=str,
+        default="pyav",
+        choices=["pyav", "torchcodec", "video_reader"],
+        help="Video backend to use (default: pyav, more reliable than torchcodec)",
+    )
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = get_args()
+
+    # Create output directory early
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load dataset
+    print(f"Loading dataset: {args.dataset_root}")
+    dataset = LeRobotDataset(root=args.dataset_root, video_backend=args.video_backend)
+    print(f"Dataset loaded: {len(dataset)} frames, {dataset.num_episodes} episodes")
+
+    # Determine which samples to extract
+    if args.frame_indices:
+        indices = args.frame_indices
+        sample_names = [f"frame_{idx}" for idx in indices]
+    elif args.frame_index is not None:
+        indices = [args.frame_index]
+        sample_names = [f"frame_{args.frame_index}"]
+    elif args.episode_index is not None and args.frame_in_episode is not None:
+        # Calculate global index
+        episode_info = dataset.meta.episodes[args.episode_index]
+        global_idx = episode_info["dataset_from_index"] + args.frame_in_episode
+        indices = [global_idx]
+        sample_names = [f"episode_{args.episode_index}_frame_{args.frame_in_episode}"]
+    else:
+        raise ValueError(
+            "Must provide --frame_index, --frame_indices, or (--episode_index + --frame_in_episode)"
+        )
+
+    # Extract and save samples
+    all_paths = []
+
+    for idx, sample_name in zip(indices, sample_names, strict=False):
+        print(f"\n{'=' * 60}")
+        print(f"Extracting sample {idx} ({sample_name})")
+        print(f"{'=' * 60}")
+
+        sample = extract_sample(dataset, frame_index=idx)
+        paths = dump_sample(sample, output_dir, sample_name, args.image_format, dataset=dataset)
+        all_paths.append({"index": idx, "sample_name": sample_name, "paths": paths})
+
+    summary_path = output_dir / "extraction_summary.json"
+    with open(summary_path, "w", encoding="utf-8") as f:
+        json.dump(all_paths, f, indent=2)
+    print(f"Extraction complete! Summary saved to: {summary_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/qwen_gr00t/run_client.sh b/examples/qwen_gr00t/run_client.sh
new file mode 100755
index 0000000000..eabf779a32
--- /dev/null
+++ b/examples/qwen_gr00t/run_client.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Script to run the pi0 client using paths from examples/pi0/conf/inference/pi0.yaml
+
+set -e
+
+# Defaults from examples/pi0/conf/inference/pi0.yaml; set INPUT_DIR to use another directory
+INPUT_DIR="${INPUT_DIR:-/share/project/fengyupu/github/FlagScale/inference_inputs}"
+BASE_IMG="$INPUT_DIR/frame_100_observation_images_cam_high.jpg"
+LEFT_WRIST_IMG="$INPUT_DIR/frame_100_observation_images_cam_left_wrist.jpg"
+RIGHT_WRIST_IMG="$INPUT_DIR/frame_100_observation_images_cam_right_wrist.jpg"
+STATE_PATH="$INPUT_DIR/frame_100_state.pt"
+TASK_PATH="$INPUT_DIR/frame_100_task.txt"
+
+# Server settings
+HOST="${1:-127.0.0.1}"
+PORT="${2:-5000}"
+
+INSTRUCTION=$(cat "$TASK_PATH")  # instruction text is read from the task file
+
+# Run the client
+python examples/pi0/client_pi0.py \
+  --host "$HOST" \
+  --port "$PORT" \
+  --img1 "$BASE_IMG" \
+  --img2 "$LEFT_WRIST_IMG" \
+  --img3 "$RIGHT_WRIST_IMG" \
+  --state-path "$STATE_PATH" \
+  --instruction "$INSTRUCTION"
diff --git a/flagscale/logger.py b/flagscale/logger.py
index 738dbcb8dd..61dcf26a1c 100644
--- a/flagscale/logger.py
+++ b/flagscale/logger.py
@@ -22,19 +22,19 @@ def __init__(self, name, level=logging.INFO):
         self.logger.addHandler(stream_handler)
 
     def info(self, message):
-        self.logger.info(message)
+        self.logger.info(message, stacklevel=2)
 
     def warning(self, message):
-        self.logger.warning(message)
+        self.logger.warning(message, stacklevel=2)
 
     def error(self, message):
-        self.logger.error(message)
+        self.logger.error(message, stacklevel=2)
 
     def critical(self, message):
-        self.logger.critical(message)
+        self.logger.critical(message, stacklevel=2)
 
     def debug(self, message):
-        self.logger.debug(message)
+        self.logger.debug(message, stacklevel=2)
 
 
 GLOBAL_LOGGER = None
diff --git a/flagscale/models/action_model/__init__.py b/flagscale/models/action_model/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/flagscale/models/action_model/flow_matching_head/__init__.py b/flagscale/models/action_model/flow_matching_head/__init__.py
new file mode 100644
index 0000000000..3159bfe656
--- /dev/null
+++ b/flagscale/models/action_model/flow_matching_head/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/flagscale/models/action_model/flow_matching_head/action_encoder.py b/flagscale/models/action_model/flow_matching_head/action_encoder.py
new file mode 100644
index 0000000000..2b005c6be6
--- /dev/null
+++ b/flagscale/models/action_model/flow_matching_head/action_encoder.py
@@ -0,0 +1,105 @@
+# Mainly adopted from:
+# https://github.com/starVLA/starVLA/blob/3f7feefbc5fc25890ad3a7d262b8a0aea1339aa7/starVLA/model/modules/action_model/flow_matching_head/action_encoder.py
+# Below is the original copyright:
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+class SinusoidalPositionalEncoding(nn.Module):
+    """
+    Produces a sinusoidal encoding of shape (B, T, w)
+    given timesteps of shape (B, T).
+    """
+
+    def __init__(self, embedding_dim):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+
+    def forward(self, timesteps):
+        # timesteps: shape (B, T)
+        # We'll compute sin/cos frequencies across dim T
+        timesteps = timesteps.float()  # ensure float
+
+        B, T = timesteps.shape
+        device = timesteps.device
+
+        half_dim = self.embedding_dim // 2
+        # typical log space frequencies for sinusoidal encoding
+        exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * (
+            torch.log(torch.tensor(10000.0)) / half_dim
+        )
+        # Expand timesteps to (B, T, 1) then multiply
+        freqs = timesteps.unsqueeze(-1) * exponent.exp()  # (B, T, half_dim)
+
+        sin = torch.sin(freqs)
+        cos = torch.cos(freqs)
+        enc = torch.cat([sin, cos], dim=-1)  # (B, T, w)
+
+        return enc
+
+
+class ActionEncoder(nn.Module):
+    def __init__(self, action_dim, hidden_size):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        # W1: R^{w x d}, W2: R^{w x 2w}, W3: R^{w x w}
+        self.W1 = nn.Linear(action_dim, hidden_size)  # (d -> w)
+        self.W2 = nn.Linear(2 * hidden_size, hidden_size)  # (2w -> w)
+        self.W3 = nn.Linear(hidden_size, hidden_size)  # (w -> w)
+
+        self.pos_encoding = SinusoidalPositionalEncoding(hidden_size)
+
+    def forward(self, actions, timesteps):
+        """
+        actions:   shape (B, T, action_dim)
+        timesteps: shape (B,)  -- a single scalar per batch item
+        returns:   shape (B, T, hidden_size)
+        """
+        B, T, _ = actions.shape
+
+        # 1) Expand each batch's single scalar time 'tau' across all T steps
+        #    so that shape => (B, T)
+        #    e.g. if timesteps is (B,), replicate across T
+        if timesteps.dim() == 1 and timesteps.shape[0] == B:
+            # shape (B,) => (B,T)
+            timesteps = timesteps.unsqueeze(1).expand(-1, T)
+        else:
+            raise ValueError(
+                "Expected `timesteps` to have shape (B,) so we can replicate across T."
+            )
+
+        # 2) Standard action MLP step for shape => (B, T, w)
+        a_emb = self.W1(actions)
+
+        # 3) Get the sinusoidal encoding (B, T, w)
+        tau_emb = self.pos_encoding(timesteps).to(dtype=a_emb.dtype)
+
+        # 4) Concat along last dim => (B, T, 2w), then W2 => (B, T, w), swish
+        x = torch.cat([a_emb, tau_emb], dim=-1)
+        x = swish(self.W2(x))
+
+        # 5) Finally W3 => (B, T, w)
+        x = self.W3(x)
+
+        return x
diff --git a/flagscale/models/action_model/flow_matching_head/cross_attention_dit.py b/flagscale/models/action_model/flow_matching_head/cross_attention_dit.py
new file mode 100755
index 0000000000..3da618f5bd
--- /dev/null
+++ b/flagscale/models/action_model/flow_matching_head/cross_attention_dit.py
@@ -0,0 +1,378 @@
+# Mainly adopted from:
+# https://github.com/starVLA/starVLA/blob/3f7feefbc5fc25890ad3a7d262b8a0aea1339aa7/starVLA/model/modules/action_model/flow_matching_head/cross_attention_dit.py
+# Below is the original copyright:
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+import torch.nn.functional as F
+from diffusers import ConfigMixin, ModelMixin
+from diffusers.configuration_utils import register_to_config
+from diffusers.models.attention import Attention, FeedForward
+from diffusers.models.embeddings import (
+    SinusoidalPositionalEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from torch import nn
+
+
+class TimestepEncoder(nn.Module):
+    """Turn timesteps into ``embedding_dim``-sized conditioning vectors.
+
+    Timesteps go through a 256-channel ``Timesteps`` projection and then a
+    learned ``TimestepEmbedding``.
+
+    Args:
+        embedding_dim: Size of the returned embedding.
+        compute_dtype: Accepted for interface compatibility but not used here;
+            ``forward`` takes its working dtype from the module parameters.
+    """
+
+    def __init__(self, embedding_dim, compute_dtype=torch.float32):
+        super().__init__()
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+    def forward(self, timesteps):
+        """Return one embedding per input timestep, i.e. (N, D)."""
+        param_dtype = next(self.parameters()).dtype
+        projected = self.time_proj(timesteps)
+        # Cast to the parameters' dtype before the learned layers.
+        return self.timestep_embedder(projected.to(param_dtype))
+
+
+class AdaLayerNorm(nn.Module):
+    """Layer norm whose scale and shift are predicted from a conditioning embedding.
+
+    Args:
+        embedding_dim: Feature size of both ``x`` and ``temb``.
+        norm_elementwise_affine: Forwarded to ``nn.LayerNorm``.
+        norm_eps: Forwarded to ``nn.LayerNorm``.
+        chunk_dim: Stored on the module; ``forward`` always splits along dim 1.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-5,
+        chunk_dim: int = 0,
+    ):
+        super().__init__()
+        self.chunk_dim = chunk_dim
+        self.silu = nn.SiLU()
+        # A single projection produces both modulation terms, hence twice the width.
+        self.linear = nn.Linear(embedding_dim, 2 * embedding_dim)
+        self.norm = nn.LayerNorm(
+            embedding_dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        temb: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Return ``norm(x) * (1 + scale) + shift`` with scale/shift derived from ``temb``.
+
+        ``temb`` is effectively required despite its ``None`` default. The
+        projected embedding is split in half along dim 1 and each half is
+        broadcast over dim 1 of ``x``.
+        """
+        modulation = self.linear(self.silu(temb))
+        scale, shift = modulation.chunk(2, dim=1)
+        normed = self.norm(x)
+        return normed * (1 + scale[:, None]) + shift[:, None]
+
+
+class BasicTransformerBlock(nn.Module):
+    """Pre-norm transformer block: one attention layer followed by a feed-forward layer.
+
+    ``attn1`` is built with ``cross_attention_dim`` and receives
+    ``encoder_hidden_states`` in ``forward``, so the same block is used both
+    with and without encoder context. With ``norm_type="ada_norm"`` the first
+    norm is an ``AdaLayerNorm`` conditioned on ``temb``; any other value
+    selects a plain ``nn.LayerNorm``.
+
+    ``attention_type`` is accepted but not used by this implementation.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: int | None = None,
+        activation_fn: str = "geglu",
+        attention_bias: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: str | None = None,
+        num_positional_embeddings: int | None = None,
+        ff_inner_dim: int | None = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.dropout = dropout
+        self.cross_attention_dim = cross_attention_dim
+        self.activation_fn = activation_fn
+        self.attention_bias = attention_bias
+        self.norm_elementwise_affine = norm_elementwise_affine
+        self.positional_embeddings = positional_embeddings
+        self.num_positional_embeddings = num_positional_embeddings
+        self.norm_type = norm_type
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positional_embeddings` must also be defined."
+            )
+
+        # Only the "sinusoidal" scheme creates a positional-embedding module.
+        self.pos_embed = (
+            SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+            if positional_embeddings == "sinusoidal"
+            else None
+        )
+
+        # 1. Attention, with its own (optionally timestep-conditioned) norm.
+        self.norm1 = (
+            AdaLayerNorm(dim)
+            if norm_type == "ada_norm"
+            else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+        )
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Feed-forward, with its own norm (kept under the name `norm3`).
+        self.norm3 = nn.LayerNorm(
+            dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine
+        )
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+        # Extra dropout applied to the attention output when requested.
+        self.final_dropout = nn.Dropout(dropout) if final_dropout else None
+
+    @staticmethod
+    def _squeeze_if_4d(tensor: torch.Tensor) -> torch.Tensor:
+        """Apply ``squeeze(1)`` to 4-D tensors; return anything else unchanged."""
+        return tensor.squeeze(1) if tensor.ndim == 4 else tensor
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        encoder_hidden_states: torch.Tensor | None = None,
+        encoder_attention_mask: torch.Tensor | None = None,
+        temb: torch.LongTensor | None = None,
+    ) -> torch.Tensor:
+        """Run attention and feed-forward, each with a residual connection.
+
+        ``attention_mask`` is accepted for interface compatibility but is not
+        used; the attention layer is masked with ``encoder_attention_mask``.
+        ``temb`` is only consumed when ``norm_type == "ada_norm"``.
+        """
+        # 1. Attention on the normalized input.
+        normed = (
+            self.norm1(hidden_states, temb)
+            if self.norm_type == "ada_norm"
+            else self.norm1(hidden_states)
+        )
+        if self.pos_embed is not None:
+            normed = self.pos_embed(normed)
+
+        attn_output = self.attn1(
+            normed,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=encoder_attention_mask,  # @JinhuiYE original attention_mask=attention_mask
+        )
+        if self.final_dropout is not None:
+            attn_output = self.final_dropout(attn_output)
+        hidden_states = self._squeeze_if_4d(attn_output + hidden_states)
+
+        # 2. Feed-forward on the normalized attention result.
+        ff_output = self.ff(self.norm3(hidden_states))
+        return self._squeeze_if_4d(ff_output + hidden_states)
+
+
+class DiT(ModelMixin, ConfigMixin):
+    """Stack of ``BasicTransformerBlock`` layers conditioned on a timestep embedding.
+
+    Blocks attend to ``encoder_hidden_states``. When
+    ``interleave_self_attention`` is true, every odd-indexed block is built
+    without a cross-attention dim and is called without encoder states. The
+    final hidden states are modulated by a timestep-derived shift/scale and
+    projected to ``output_dim``.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    # `register_to_config` records the constructor arguments in the config, so
+    # they can later be read as `self.config.xxx` rather than `self.xxx`.
+    # TODO: switch to our singleton pattern and write a mergeable @merge_pram_config.
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 8,
+        attention_head_dim: int = 64,
+        output_dim: int = 26,
+        num_layers: int = 12,
+        dropout: float = 0.1,
+        attention_bias: bool = True,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: int | None = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-5,
+        max_num_positional_embeddings: int = 512,
+        compute_dtype=torch.float32,
+        final_dropout: bool = True,
+        positional_embeddings: str | None = "sinusoidal",
+        interleave_self_attention=False,
+        cross_attention_dim: int | None = None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.gradient_checkpointing = False
+
+        # Timestep encoder. `self.config.compute_dtype` may be absent, so fall
+        # back to float32 instead of reading it directly.
+        # TODO BUG: reading `self.config.compute_dtype` works during training
+        # but raises during eval.
+        self.timestep_encoder = TimestepEncoder(
+            embedding_dim=self.inner_dim,
+            compute_dtype=getattr(self.config, "compute_dtype", torch.float32),
+        )
+
+        def build_block(idx):
+            # Odd-indexed blocks drop the cross-attention dim when interleaving.
+            self_attn_only = idx % 2 == 1 and interleave_self_attention
+            return BasicTransformerBlock(
+                self.inner_dim,
+                self.config.num_attention_heads,
+                self.config.attention_head_dim,
+                dropout=self.config.dropout,
+                activation_fn=self.config.activation_fn,
+                attention_bias=self.config.attention_bias,
+                upcast_attention=self.config.upcast_attention,
+                norm_type=norm_type,
+                norm_elementwise_affine=self.config.norm_elementwise_affine,
+                norm_eps=self.config.norm_eps,
+                positional_embeddings=positional_embeddings,
+                num_positional_embeddings=self.config.max_num_positional_embeddings,
+                final_dropout=final_dropout,
+                cross_attention_dim=None if self_attn_only else cross_attention_dim,
+            )
+
+        self.transformer_blocks = nn.ModuleList(
+            [build_block(idx) for idx in range(self.config.num_layers)]
+        )
+
+        # Output blocks
+        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+        self.proj_out_2 = nn.Linear(self.inner_dim, self.config.output_dim)
+
+        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        print(
+            "Total number of DiT parameters: ",
+            trainable_params,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # Shape: (B, T, D)
+        encoder_hidden_states: torch.Tensor,  # Shape: (B, S, D)
+        timestep: torch.LongTensor | None = None,
+        return_all_hidden_states: bool = False,
+        encoder_attention_mask=None,
+    ):
+        """Run all blocks and project the result to ``output_dim``.
+
+        Returns the projected output, or ``(output, all_hidden_states)`` when
+        ``return_all_hidden_states`` is true; the list holds the input followed
+        by the output of every block.
+        """
+        temb = self.timestep_encoder(timestep)
+
+        hidden_states = hidden_states.contiguous()
+        encoder_hidden_states = encoder_hidden_states.contiguous()
+        all_hidden_states = [hidden_states]
+
+        interleave = self.config.interleave_self_attention
+        for idx, block in enumerate(self.transformer_blocks):
+            # Interleaved odd blocks get neither encoder states nor mask.
+            self_attn_only = idx % 2 == 1 and interleave
+            hidden_states = block(
+                hidden_states,
+                attention_mask=None,
+                encoder_hidden_states=None if self_attn_only else encoder_hidden_states,
+                encoder_attention_mask=None if self_attn_only else encoder_attention_mask,
+                temb=temb,
+            )
+            all_hidden_states.append(hidden_states)
+
+        # Output processing: timestep-conditioned shift/scale, then projection.
+        shift, scale = self.proj_out_1(F.silu(temb)).chunk(2, dim=1)
+        hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+        output = self.proj_out_2(hidden_states)
+        if return_all_hidden_states:
+            return output, all_hidden_states
+        return output
+
+
+class SelfAttentionTransformer(ModelMixin, ConfigMixin):
+    """Stack of ``BasicTransformerBlock`` layers called without encoder states.
+
+    Blocks are built with their default norm type and no cross-attention dim.
+    Several constructor arguments (``output_dim``, ``num_embeds_ada_norm``,
+    ``compute_dtype``, ``interleave_self_attention``) are recorded in the
+    config but not otherwise used here.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 8,
+        attention_head_dim: int = 64,
+        output_dim: int = 26,
+        num_layers: int = 12,
+        dropout: float = 0.1,
+        attention_bias: bool = True,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: int | None = 1000,
+        upcast_attention: bool = False,
+        max_num_positional_embeddings: int = 512,
+        compute_dtype=torch.float32,
+        final_dropout: bool = True,
+        positional_embeddings: str | None = "sinusoidal",
+        interleave_self_attention=False,
+    ):
+        super().__init__()
+
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.gradient_checkpointing = False
+
+        blocks = []
+        for _ in range(self.config.num_layers):
+            blocks.append(
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    activation_fn=self.config.activation_fn,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    positional_embeddings=positional_embeddings,
+                    num_positional_embeddings=self.config.max_num_positional_embeddings,
+                    final_dropout=final_dropout,
+                )
+            )
+        self.transformer_blocks = nn.ModuleList(blocks)
+
+        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        print(
+            "Total number of SelfAttentionTransformer parameters: ",
+            trainable_params,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # Shape: (B, T, D)
+        return_all_hidden_states: bool = False,
+    ):
+        """Pass ``hidden_states`` through every block in order.
+
+        Returns the final hidden states, or ``(final, all_hidden_states)`` when
+        ``return_all_hidden_states`` is true; the list holds the input followed
+        by the output of every block.
+        """
+        hidden_states = hidden_states.contiguous()
+        all_hidden_states = [hidden_states]
+
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states)
+            all_hidden_states.append(hidden_states)
+
+        if not return_all_hidden_states:
+            return hidden_states
+        return hidden_states, all_hidden_states
diff --git a/flagscale/models/action_model/gr00t_action_header.py b/flagscale/models/action_model/gr00t_action_header.py
new file mode 100644
index 0000000000..a0a6b3d395
--- /dev/null
+++ b/flagscale/models/action_model/gr00t_action_header.py
@@ -0,0 +1,443 @@
+# Mainly adopted from:
+# https://github.com/starVLA/starVLA/blob/3f7feefbc5fc25890ad3a7d262b8a0aea1339aa7/starVLA/model/modules/action_model/GR00T_ActionHeader.py
+# Below is the original copyright:
+
+# Copyright 2025 NVIDIA Corp. and affiliates. All rights reserved.
+# Modified by [Junqiu YU/ Fudan University] in [2025].
+# Modification: [rm and add some connect adapter to match with starVLA, e.g., "rm "].
+# Action repeat is inspired by CogACT
+
+
+from dataclasses import dataclass, field
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.distributions import Beta
+from transformers import PretrainedConfig
+from transformers.feature_extraction_utils import BatchFeature
+
+from flagscale.models.action_model.flow_matching_head.action_encoder import (
+    SinusoidalPositionalEncoding,
+    swish,
+)
+from flagscale.models.action_model.flow_matching_head.cross_attention_dit import DiT
+
+# TODO: try to merge the DiT modules with flow_matching_head; they share the same architecture and differ only in the loss, so relying on the diffusers package would keep this simple.
+
+
+class CategorySpecificLinear(nn.Module):
+    """Linear layer with an independent weight matrix and bias per category."""
+
+    def __init__(self, num_categories, input_dim, hidden_dim):
+        super().__init__()
+        self.num_categories = num_categories
+        # Stacked per-category parameters: W is (C, in, out), b is (C, out).
+        weight_shape = (num_categories, input_dim, hidden_dim)
+        self.W = nn.Parameter(0.02 * torch.randn(*weight_shape))
+        self.b = nn.Parameter(torch.zeros(num_categories, hidden_dim))
+
+    def forward(self, x, cat_ids):
+        """Apply each sample's category-specific affine map.
+
+        ``cat_ids`` picks one weight/bias pair per batch item; the batched
+        matmul then maps ``x`` from (B, T, in) to (B, T, out).
+        """
+        weights = self.W[cat_ids]
+        biases = self.b[cat_ids].unsqueeze(1)
+        return torch.bmm(x, weights) + biases
+
+
+class CategorySpecificMLP(nn.Module):
+    """Two ``CategorySpecificLinear`` layers with a ReLU in between."""
+
+    def __init__(self, num_categories, input_dim, hidden_dim, output_dim):
+        super().__init__()
+        self.num_categories = num_categories
+        self.layer1 = CategorySpecificLinear(num_categories, input_dim, hidden_dim)
+        self.layer2 = CategorySpecificLinear(num_categories, hidden_dim, output_dim)
+
+    def forward(self, x, cat_ids):
+        """Run both layers, selecting weights with the same ``cat_ids`` each time."""
+        activated = F.relu(self.layer1(x, cat_ids))
+        projected = self.layer2(activated, cat_ids)
+        return projected
+
+
+class MLP(nn.Module):
+    """Two-layer perceptron: Linear -> ReLU -> Linear."""
+
+    def __init__(self, input_dim, hidden_dim, output_dim):
+        super().__init__()
+        self.layer1 = nn.Linear(input_dim, hidden_dim)
+        self.layer2 = nn.Linear(hidden_dim, output_dim)
+
+    def forward(self, x):
+        """Map the last dimension of ``x`` from ``input_dim`` to ``output_dim``."""
+        hidden = F.relu(self.layer1(x))
+        return self.layer2(hidden)
+
+
+class ActionEncoder(nn.Module):
+    """Embed a chunk of actions conditioned on a per-sample timestep.
+
+    Actions are projected to ``hidden_size``, concatenated with the positional
+    encoding of the timestep, then passed through ``layer2`` (with swish) and
+    ``layer3``.
+    """
+
+    def __init__(self, action_dim, hidden_size):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.action_dim = action_dim
+        self.layer1 = nn.Linear(action_dim, hidden_size)
+        self.layer2 = nn.Linear(2 * hidden_size, hidden_size)
+        self.layer3 = nn.Linear(hidden_size, hidden_size)
+        self.pos_encoding = SinusoidalPositionalEncoding(hidden_size)
+
+    def forward(self, actions, timesteps):
+        """
+        actions:   shape (B, T, action_dim)
+        timesteps: shape (B,)  -- a single scalar per batch item
+        returns:   shape (B, T, hidden_size)
+        """
+        num_samples, num_steps, _ = actions.shape
+
+        # Reject anything that is not exactly one timestep per batch item.
+        if timesteps.dim() != 1 or timesteps.shape[0] != num_samples:
+            raise ValueError(
+                "Expected `timesteps` to have shape (B,) so we can replicate across T."
+            )
+
+        # Share each sample's timestep across all of its steps: (B,) -> (B, T).
+        tiled_t = timesteps.unsqueeze(1).expand(-1, num_steps)
+
+        action_emb = self.layer1(actions)  # (B, T, w)
+        time_emb = self.pos_encoding(tiled_t).to(dtype=action_emb.dtype)
+
+        # Channel-wise concat gives (B, T, 2w); layer2 brings it back to w.
+        joint = torch.cat([action_emb, time_emb], dim=-1)
+        activated = swish(self.layer2(joint))
+
+        return self.layer3(activated)
+
+
+class MultiEmbodimentActionEncoder(nn.Module):
+    """Action encoder whose three linear layers are selected per embodiment.
+
+    Same data flow as a plain action encoder (project, concatenate the
+    timestep encoding, two more layers with swish in between), but every layer
+    is a ``CategorySpecificLinear`` indexed by ``cat_ids``.
+    """
+
+    def __init__(self, action_dim, hidden_size, num_embodiments):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_embodiments = num_embodiments
+
+        # Layer shapes, with d = action_dim and w = hidden_size.
+        self.W1 = CategorySpecificLinear(num_embodiments, action_dim, hidden_size)  # d -> w
+        self.W2 = CategorySpecificLinear(num_embodiments, 2 * hidden_size, hidden_size)  # 2w -> w
+        self.W3 = CategorySpecificLinear(num_embodiments, hidden_size, hidden_size)  # w -> w
+        self.pos_encoding = SinusoidalPositionalEncoding(hidden_size)
+
+    def forward(self, actions, timesteps, cat_ids):
+        """
+        actions:   shape (B, T, action_dim)
+        timesteps: shape (B,)  -- a single scalar per batch item
+        cat_ids:   shape (B,)
+        returns:   shape (B, T, hidden_size)
+        """
+        batch_size, horizon, _ = actions.shape
+
+        # Reject anything that is not exactly one timestep per batch item.
+        if timesteps.dim() != 1 or timesteps.shape[0] != batch_size:
+            raise ValueError(
+                "Expected `timesteps` to have shape (B,) so we can replicate across T."
+            )
+
+        # Replicate each sample's timestep across the horizon: (B,) -> (B, T).
+        per_step_t = timesteps.unsqueeze(1).expand(-1, horizon)
+
+        action_emb = self.W1(actions, cat_ids)  # (B, T, w)
+        time_emb = self.pos_encoding(per_step_t).to(dtype=action_emb.dtype)
+
+        # Channel-wise concat gives (B, T, 2w); W2 brings it back to w.
+        fused = torch.cat([action_emb, time_emb], dim=-1)
+        hidden = swish(self.W2(fused, cat_ids))
+
+        return self.W3(hidden, cat_ids)
+
+
+@dataclass
+class FlowmatchingActionHeadConfig(PretrainedConfig):
+    """Configuration fields for the flow-matching action head.
+
+    NOTE: N1.5 uses XEmbFlowmatchingPolicyHeadConfig as action head
+
+    Fields are declared dataclass-style with their defaults. The explicit
+    ``__init__`` below forwards all keyword arguments to ``PretrainedConfig``
+    and then sets each one as an instance attribute.
+    """
+
+    add_pos_embed: bool = field(
+        default=True, metadata={"help": "Whether to add positional embedding"}
+    )
+    diffusion_model_cfg: dict = field(
+        default=None, metadata={"help": "Diffusion model configuration."}
+    )
+    input_embedding_dim: int = field(
+        default=1536, metadata={"help": "Input embedding channel dimension."}
+    )
+
+    # NOTE(review): the help text below repeats "Input embedding dimension";
+    # presumably this is the hidden width of the head's MLPs -- confirm.
+    hidden_size: int = field(default=1024, metadata={"help": "Input embedding dimension."})
+    max_seq_len: int = field(default=1024, metadata={"help": "Maximum Sequence Length"})
+    action_dim: int = field(default=None, metadata={"help": "Action dimension."})
+    action_horizon: int = field(default=None, metadata={"help": "Action horizon."})
+    # Parameters of the Beta distribution used when sampling the noise time
+    # (per the `noise_s` help text); their help strings are left empty.
+    noise_beta_alpha: float = field(default=1.5, metadata={"help": ""})
+    noise_beta_beta: float = field(default=1.0, metadata={"help": ""})
+    noise_s: float = field(
+        default=0.999, metadata={"help": "Flow matching noise Beta distribution s."}
+    )
+    num_timestep_buckets: int = field(
+        default=1000, metadata={"help": "Number of timestep discretization buckets."}
+    )
+    num_inference_timesteps: int = field(
+        default=None,
+        metadata={"help": "Number of inference steps for noise diffusion."},
+    )
+    max_num_embodiments: int = field(default=32, metadata={"help": "Number of embodiments."})
+    tune_projector: bool = field(default=True, metadata={"help": "Whether to tune the projector."})
+    tune_diffusion_model: bool = field(
+        default=True, metadata={"help": "Whether to tune the diffusion model."}
+    )
+    load_pretrained_det_decode_layer_path: str = field(
+        default=None, metadata={"help": "Path to pretrained detection model."}
+    )
+    detection_coeff: float = field(default=1.0, metadata={"help": "Detection coefficient."})
+
+    # The next fields carry no help metadata.
+    freeze_decode_layer: bool = field(default=False)
+    expand_batch: int = field(default=None)
+    use_vlln: bool = field(default=True)
+
+    vl_self_attention_cfg: dict = field(default=None)
+    num_target_vision_tokens: int = field(
+        default=32, metadata={"help": "Number of target vision tokens."}
+    )
+
+    def __init__(self, **kwargs):
+        """Pass ``kwargs`` to ``PretrainedConfig`` and mirror each one as an attribute."""
+        super().__init__(**kwargs)
+        # Every keyword is set directly on the instance, whether or not a
+        # field above declares it.
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+
+# Architecture presets for the DiT, keyed by preset name. In both presets
+# `input_embedding_dim` equals `attention_head_dim * num_attention_heads`
+# (64 * 12 = 768 and 48 * 32 = 1536).
+DiTConfig = {
+    "DiT-B": {"input_embedding_dim": 768, "attention_head_dim": 64, "num_attention_heads": 12},
+    "DiT-L": {"input_embedding_dim": 1536, "attention_head_dim": 48, "num_attention_heads": 32},
+}
+
+
+class FlowmatchingActionHead(nn.Module):
+    """Flow-matching action head built around a DiT that attends to VL embeddings.
+
+    Training (``forward``) blends Gaussian noise with the target actions at a
+    sampled time ``t`` and regresses the velocity ``actions - noise`` with an
+    MSE loss. Inference (``predict_action``) starts from noise and integrates
+    the predicted velocity with fixed-step Euler updates.
+
+    Args:
+        full_config: Global config; the head reads ``full_config.model.action_model``.
+    """
+
+    def __init__(
+        self,
+        full_config,
+    ):
+        super().__init__()
+        config = full_config.model.action_model
+        self.hidden_size = config.hidden_size  # @JinhuiYE
+        self.full_config = full_config
+        action_model_cfg = DiTConfig[config.action_model_type]
+
+        self.input_embedding_dim = action_model_cfg["input_embedding_dim"]
+        # Preset values go first so explicit `diffusion_model_cfg` entries win.
+        diffusion_model_cfg = {**action_model_cfg, **config.diffusion_model_cfg}
+        self.model = DiT(**diffusion_model_cfg)
+
+        self.action_dim = config.action_dim
+        self.action_horizon = config.future_action_window_size + 1
+        self.num_inference_timesteps = config.num_inference_timesteps
+
+        # The state encoder is optional: it only exists when `state_dim` is set.
+        self.state_encoder = (
+            MLP(
+                input_dim=config.state_dim,
+                hidden_dim=self.hidden_size,
+                output_dim=self.input_embedding_dim,
+            )
+            if config.state_dim
+            else None
+        )
+
+        self.action_encoder = ActionEncoder(
+            action_dim=config.action_dim,
+            hidden_size=self.input_embedding_dim,
+        )
+        self.action_decoder = MLP(
+            input_dim=self.model.config.output_dim,
+            hidden_dim=self.hidden_size,
+            output_dim=self.action_dim,
+        )
+        self.future_tokens = nn.Embedding(config.num_target_vision_tokens, self.input_embedding_dim)
+        nn.init.normal_(self.future_tokens.weight, mean=0.0, std=0.02)
+
+        if config.add_pos_embed:
+            self.position_embedding = nn.Embedding(config.max_seq_len, self.input_embedding_dim)
+            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
+
+        self.beta_dist = Beta(config.noise_beta_alpha, config.noise_beta_beta)
+        self.num_timestep_buckets = config.num_timestep_buckets
+        self.config = config
+
+    def sample_time(self, batch_size, device, dtype):
+        """Return ``(noise_s - s) / noise_s`` for ``batch_size`` draws ``s`` from the Beta prior."""
+        sample = self.beta_dist.sample([batch_size]).to(device, dtype=dtype)
+        return (self.config.noise_s - sample) / self.config.noise_s
+
+    def prepare_input(self, batch: dict) -> BatchFeature:
+        """Wrap a raw batch dict in a ``BatchFeature``."""
+        return BatchFeature(data=batch)
+
+    def _encode_state(self, state):
+        """Embed the optional ``state`` tensor; return ``None`` when it is absent.
+
+        Raises:
+            ValueError: If ``state`` is given but no state encoder was built.
+        """
+        if state is None:
+            return None
+        if self.state_encoder is None:
+            raise ValueError(
+                "`state` was provided but the action head has no state encoder; "
+                "set config.model.action_model.state_dim to enable it."
+            )
+        return self.state_encoder(state)
+
+    def _build_sa_embs(self, action_features, state_features, batch_size, device):
+        """Assemble the DiT input sequence: [state?, future tokens, actions].
+
+        Adds the learned position embedding to the action features when
+        ``add_pos_embed`` is enabled, then concatenates along the sequence dim.
+        """
+        if self.config.add_pos_embed:
+            pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device)
+            action_features = action_features + self.position_embedding(pos_ids).unsqueeze(0)
+
+        future_tokens = self.future_tokens.weight.unsqueeze(0).expand(batch_size, -1, -1)
+        parts = [future_tokens, action_features]
+        if state_features is not None:
+            parts.insert(0, state_features)
+        return torch.cat(parts, dim=1)
+
+    def forward(
+        self,
+        vl_embs: torch.Tensor,
+        actions: torch.Tensor,
+        state: torch.Tensor = None,
+        encoder_attention_mask=None,
+    ):
+        """Compute the flow-matching training loss.
+
+        vl_embs: shape (B, seq_length, feature_dim)
+        actions: shape (B, T, D_action), with D_action == ``self.action_dim``
+        state:   optional; must be concatenable with the token sequence along
+                 dim 1 after encoding -- presumably (B, 1, state_dim), confirm
+                 against the caller.
+        encoder_attention_mask: optional mask forwarded to the DiT for ``vl_embs``.
+
+        Returns:
+            Scalar MSE between the predicted and the target velocity.
+
+        Raises:
+            ValueError: If the last dim of ``actions`` differs from ``self.action_dim``.
+        """
+        device = vl_embs.device
+
+        # Validate action dimension
+        if actions.shape[-1] != self.action_dim:
+            raise ValueError(
+                f"Action dimension mismatch: model expects {self.action_dim} dimensions "
+                f"(from config), but received actions with {actions.shape[-1]} dimensions. "
+                f"Please update config.model.action_model.action_dim to match your data."
+            )
+
+        # Sample noise and a per-sample time, then build the noised trajectory.
+        noise = torch.randn(actions.shape, device=actions.device, dtype=actions.dtype)
+        t = self.sample_time(actions.shape[0], device=actions.device, dtype=actions.dtype)
+        t = t[:, None, None]  # shape (B,1,1) for broadcast
+
+        noisy_trajectory = (1 - t) * noise + t * actions
+        velocity = actions - noise  # regression target
+
+        # Convert continuous t to a discrete bucket index for the encoders.
+        t_discretized = (t[:, 0, 0] * self.num_timestep_buckets).long()
+        action_features = self.action_encoder(noisy_trajectory, t_discretized)
+        state_features = self._encode_state(state)
+
+        sa_embs = self._build_sa_embs(action_features, state_features, vl_embs.shape[0], device)
+
+        model_output = self.model(
+            hidden_states=sa_embs,
+            encoder_hidden_states=vl_embs,
+            encoder_attention_mask=encoder_attention_mask,
+            timestep=t_discretized,
+            return_all_hidden_states=False,  # NOTE (YL): not using flare now
+        )
+        pred = self.action_decoder(model_output)
+
+        # Only the trailing action positions of the sequence carry predictions.
+        pred_actions = pred[:, -actions.shape[1] :]
+        return ((pred_actions - velocity) ** 2).mean()
+
+    @torch.no_grad()
+    def predict_action(
+        self,
+        vl_embs: torch.Tensor,
+        state: torch.Tensor = None,
+        encoder_attention_mask=None,
+    ) -> torch.Tensor:
+        """Generate actions by Euler-integrating the predicted velocity from noise.
+
+        Args:
+            vl_embs: Vision-language embeddings, shape (B, seq_length, feature_dim).
+            state: Optional state tensor, handled as in ``forward``.
+            encoder_attention_mask: Optional mask forwarded to the DiT for
+                ``vl_embs``; defaults to no masking.
+
+        Returns:
+            Tensor of shape (B, self.action_horizon, self.action_dim).
+
+        Raises:
+            ValueError: If ``num_inference_timesteps`` is unset or below 1.
+        """
+        num_steps = self.num_inference_timesteps
+        if num_steps is None or num_steps < 1:
+            raise ValueError(
+                "config.model.action_model.num_inference_timesteps must be a positive "
+                f"integer to run inference, got {num_steps!r}."
+            )
+
+        batch_size = vl_embs.shape[0]
+        device = vl_embs.device
+        # Start from pure noise. The horizon must be the same one used to slice
+        # the predictions below, so use the module attributes for both.
+        actions = torch.randn(
+            size=(batch_size, self.action_horizon, self.action_dim),
+            dtype=vl_embs.dtype,
+            device=device,
+        )
+        dt = 1.0 / num_steps
+
+        # The state embedding does not depend on the denoising step.
+        state_features = self._encode_state(state)
+
+        for step in range(num_steps):
+            t_cont = step / float(num_steps)  # e.g. goes 0, 1/N, 2/N, ...
+            t_discretized = int(t_cont * self.num_timestep_buckets)
+            timesteps_tensor = torch.full(
+                size=(batch_size,), fill_value=t_discretized, device=device
+            )
+
+            action_features = self.action_encoder(actions, timesteps_tensor)
+            sa_embs = self._build_sa_embs(action_features, state_features, batch_size, device)
+
+            model_output = self.model(
+                hidden_states=sa_embs,
+                encoder_hidden_states=vl_embs,
+                encoder_attention_mask=encoder_attention_mask,
+                timestep=timesteps_tensor,
+            )
+            pred_velocity = self.action_decoder(model_output)[:, -self.action_horizon :]
+
+            # Euler step along the predicted velocity field.
+            actions = actions + dt * pred_velocity
+        return actions
+
+    @property
+    def device(self):
+        """Device of the first parameter."""
+        return next(iter(self.parameters())).device
+
+    @property
+    def dtype(self):
+        """Dtype of the first parameter."""
+        return next(iter(self.parameters())).dtype
diff --git a/flagscale/models/qwen_pi/__init__.py b/flagscale/models/qwen_pi/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/flagscale/models/robobrain_x/groot_action_header.py b/flagscale/models/robobrain_x/groot_action_header.py
index 95895e65df..83dccc274e 100644
--- a/flagscale/models/robobrain_x/groot_action_header.py
+++ b/flagscale/models/robobrain_x/groot_action_header.py
@@ -213,7 +213,7 @@ def __init__(self, **kwargs):
 class FlowmatchingActionHead(nn.Module):
     def __init__(self, full_config):
         super().__init__()
-        config = full_config.framework.action_model
+        config = full_config.model.action_model
         self.no_random = config.get("no_random", True)
         self.hidden_size = config.hidden_size
         self.full_config = full_config
@@ -402,7 +402,7 @@ def get_action_model(config=None):
     Factory: build FlowmatchingActionHead from global framework config.
 
     Args:
-        config: Global config (expects config.framework.action_model namespace).
+        config: Global config (expects config.model.action_model namespace).
 
     Returns:
         FlowmatchingActionHead: Initialized FlowMatchingActionHead.
diff --git a/flagscale/models/robobrain_x/qwen2_5.py b/flagscale/models/robobrain_x/qwen2_5.py
index c471c4fde4..d32215eec8 100644
--- a/flagscale/models/robobrain_x/qwen2_5.py
+++ b/flagscale/models/robobrain_x/qwen2_5.py
@@ -51,10 +51,10 @@ def __init__(self, config: dict | None = None, **kwargs):
                 where:
                     framework.qwenvl.base_vlm (str): HuggingFace model id or local path.
                 Optional expected structure (illustrative):
-                    config.framework.get("qwenvl", {}) -> {
+                    config.model.qwenvl -> {
                         "base_vlm": "Qwen/Qwen2.5-VL-3B-Instruct"
                     }
-                    config.datasets.vla_data.get("CoT_prompt", str) may be used later in build_qwenvl_inputs.
+                    config.data.vla_data.get("CoT_prompt", str) may be used later in build_qwenvl_inputs.
             **kwargs:
                 Ignored currently; placeholder for future extension (e.g., override device_map, dtype).
 
@@ -74,7 +74,7 @@ def __init__(self, config: dict | None = None, **kwargs):
         """
         super().__init__()
 
-        qwenvl_config = config.framework.get("qwenvl", {})
+        qwenvl_config = config.model.qwenvl
         model_id = qwenvl_config.get("base_vlm", "Qwen/Qwen2.5-VL-3B-Instruct")
 
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -191,7 +191,7 @@ def build_qwenvl_inputs(self, images, instructions, solutions=None, **kwargs):
                 Reserved for future extensions (e.g., system prompts, style controls, additional metadata).
 
         Config Dependencies:
-            self.config.datasets.vla_data.get("CoT_prompt", str):
+            self.config.data.vla_data.get("CoT_prompt", str):
                 If present, each instruction string is injected into the template by replacing "{instruction}".
 
         Returns:
@@ -230,8 +230,8 @@ def build_qwenvl_inputs(self, images, instructions, solutions=None, **kwargs):
         for imgs, instruction in zip(images, instructions):
             content = [{"type": "image", "image": img} for img in imgs]
 
-            if "CoT_prompt" in self.config.datasets.vla_data:  # If using a grounding prompt to task
-                CoT_prompt = self.config.datasets.vla_data.get("CoT_prompt", "")
+            if "CoT_prompt" in self.config.data.vla_data:  # If using a grounding prompt to task
+                CoT_prompt = self.config.data.vla_data.get("CoT_prompt", "")
                 prompt = CoT_prompt.replace("{instruction}", instruction)
             else:
                 prompt = instruction
diff --git a/flagscale/models/robobrain_x/qwen_groot.py b/flagscale/models/robobrain_x/qwen_groot.py
index 5388aeb3e7..d5994ab2d9 100644
--- a/flagscale/models/robobrain_x/qwen_groot.py
+++ b/flagscale/models/robobrain_x/qwen_groot.py
@@ -55,14 +55,14 @@ def __init__(self, config: dict | None = None, **kwargs) -> None:
         self.config = config
         self.qwen_vl_interface = _QWen_VL_Interface(config=self.config)
         # align dims --> we should put them to config or no?
-        self.config.framework.action_model.diffusion_model_cfg.cross_attention_dim = (
+        self.config.model.action_model.diffusion_model_cfg.cross_attention_dim = (
             self.qwen_vl_interface.model.config.hidden_size
         )
 
         self.action_model = FlowmatchingActionHead(full_config=self.config)
 
-        self.future_action_window_size = config.framework.action_model.future_action_window_size
-        self.past_action_window_size = config.framework.action_model.past_action_window_size
+        self.future_action_window_size = config.model.action_model.future_action_window_size
+        self.past_action_window_size = config.model.action_model.past_action_window_size
         self.chunk_len = self.past_action_window_size + 1 + self.future_action_window_size
 
     def forward(self, examples: list[dict] | None = None, **kwargs) -> tuple:
@@ -101,8 +101,8 @@ def forward(self, examples: list[dict] | None = None, **kwargs) -> tuple:
             ]  # (B, chunk_len, action_dim)
 
             repeated_diffusion_steps = (
-                self.config.trainer.get("repeated_diffusion_steps", 4)
-                if self.config and self.config.trainer
+                self.config.system.get("repeated_diffusion_steps", 4)
+                if self.config and self.config.system
                 else 4
             )
             actions_target_repeated = actions_target.repeat(repeated_diffusion_steps, 1, 1)
@@ -152,7 +152,7 @@ def predict_action(
             dict:
                 normalized_actions (np.ndarray): Shape [B, T, action_dim], diffusion-sampled normalized actions.
         """
-        train_obs_image_size = getattr(self.config.datasets.vla_data, "image_size", None)
+        train_obs_image_size = getattr(self.config.data.vla_data, "image_size", None)
         if train_obs_image_size:
             batch_images = resize_images(batch_images, target_size=train_obs_image_size)
 
diff --git a/flagscale/models/vla/__init__.py b/flagscale/models/vla/__init__.py
new file mode 100644
index 0000000000..fe5d035505
--- /dev/null
+++ b/flagscale/models/vla/__init__.py
@@ -0,0 +1,30 @@
+from .action_models.flow_matching import FlowMatchingHead
+from .protocols import ActionModel, VLMBackbone
+from .qwen_gr00t import QwenGr00t
+from .registry import (
+    ACTION_MODEL_REGISTRY,
+    VLM_REGISTRY,
+    build_action_model,
+    build_vlm,
+    register_action_model,
+    register_vlm,
+)
+from .utils import get_vlm_config
+
+# Explicit registration: fill the registries by direct assignment when this package is imported.
+from .vlm.qwen_vl import Qwen3VLBackbone, Qwen25VLBackbone
+
+VLM_REGISTRY["qwen2.5-vl"] = Qwen25VLBackbone
+VLM_REGISTRY["qwen3-vl"] = Qwen3VLBackbone
+ACTION_MODEL_REGISTRY["flow_matching"] = FlowMatchingHead
+
+__all__ = [
+    "VLMBackbone",
+    "ActionModel",
+    "register_vlm",
+    "register_action_model",
+    "build_vlm",
+    "build_action_model",
+    "get_vlm_config",
+    "QwenGr00t",
+]
diff --git a/flagscale/models/vla/action_models/__init__.py b/flagscale/models/vla/action_models/__init__.py
new file mode 100644
index 0000000000..91408b343c
--- /dev/null
+++ b/flagscale/models/vla/action_models/__init__.py
@@ -0,0 +1,3 @@
+from .flow_matching import FlowMatchingHead
+
+__all__ = ["FlowMatchingHead"]
diff --git a/flagscale/models/vla/action_models/flow_matching.py b/flagscale/models/vla/action_models/flow_matching.py
new file mode 100644
index 0000000000..8b36721e61
--- /dev/null
+++ b/flagscale/models/vla/action_models/flow_matching.py
@@ -0,0 +1,69 @@
+import torch
+import torch.nn as nn
+
+from flagscale.models.action_model.gr00t_action_header import (
+    FlowmatchingActionHead as _FlowmatchingActionHead,
+)
+from flagscale.models.utils.constants import ACTION
+from flagscale.models.vla.utils import get_vlm_config
+from flagscale.train.train_config import TrainConfig
+
+
+class FlowMatchingHead(nn.Module):
+    """
+    Flow matching action head wrapper for VLA framework.
+
+    Args:
+        vlm_config: HF config object from VLM (used to get hidden_size).
+        action_config: dict with action model settings (accepted but not read by this wrapper).
+        full_config: Required TrainConfig for the underlying FlowmatchingActionHead (mutated here).
+    """
+
+    def __init__(self, vlm_config, action_config: dict, full_config: TrainConfig = None):
+        super().__init__()
+        if full_config is None:  # fail early instead of an opaque AttributeError on None below
+            raise ValueError("FlowMatchingHead requires `full_config`, but got None.")
+        self.hidden_size = get_vlm_config(vlm_config)["hidden_size"]
+        # TODO: pass cross_attention_dim directly to action head instead of mutating full_config
+        full_config.model.action_model.diffusion_model_cfg.cross_attention_dim = self.hidden_size
+
+        self._head = _FlowmatchingActionHead(full_config=full_config)
+
+    def forward(
+        self, vlm_output: dict[str, torch.Tensor], action_input: dict[str, torch.Tensor], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        """
+        Args:
+            vlm_output: From VLM, contains 'hidden_states'.
+            action_input: Raw batch with 'actions'; 'state' and 'attention_mask' are optional.
+        Returns:
+            dict with 'loss'.
+        """
+        vl_embs = vlm_output["hidden_states"]
+        actions = action_input["actions"]
+        state = action_input.get("state")
+        encoder_attention_mask = action_input.get("attention_mask")
+
+        loss = self._head.forward(
+            vl_embs=vl_embs,
+            actions=actions,
+            state=state,
+            encoder_attention_mask=encoder_attention_mask,
+        )
+        return {"loss": loss}
+
+    def predict_action(
+        self, vlm_output: dict[str, torch.Tensor], action_input: dict[str, torch.Tensor], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        """
+        Args:
+            vlm_output: From VLM, contains 'hidden_states'.
+            action_input: Raw batch; only the optional 'state' entry is read.
+        Returns:
+            dict mapping `ACTION` to the actions predicted by the wrapped head.
+        """
+        vl_embs = vlm_output["hidden_states"]
+        state = action_input.get("state")
+
+        actions = self._head.predict_action(vl_embs=vl_embs, state=state)
+        return {ACTION: actions}
diff --git a/flagscale/models/vla/protocols.py b/flagscale/models/vla/protocols.py
new file mode 100644
index 0000000000..581ea61197
--- /dev/null
+++ b/flagscale/models/vla/protocols.py
@@ -0,0 +1,55 @@
+from typing import Protocol
+
+from torch import Tensor
+
+
+class VLMBackbone(Protocol):
+    @property
+    def model_config(self):
+        """HF config object (e.g., Qwen2VLConfig); same name as QwenVLBackbone.model_config."""
+        ...
+
+    def prepare_input(self, batch: dict) -> dict[str, Tensor]:
+        """Turn a raw batch into model inputs.
+        Args:
+            batch: Raw batch with 'image', 'lang', etc.
+        Returns:
+            Tokenized inputs ready for forward().
+        """
+        ...
+
+    def forward(self, batch: dict[str, Tensor], **kwargs) -> dict[str, Tensor]:
+        """Run the VLM on prepared inputs.
+        Args:
+            batch: Tokenized inputs from prepare_input().
+        Returns:
+            dict with 'hidden_states': tuple of layer outputs.
+        """
+        ...
+
+
+# TODO: (yupu) This `ActionModel` assumes that the VLA model is a composite of a VLM and an ActionModel.
+class ActionModel(Protocol):
+    def forward(
+        self, vlm_output: dict[str, Tensor], action_input: dict[str, Tensor], **kwargs
+    ) -> dict[str, Tensor]:
+        """Compute the training loss.
+        Args:
+            vlm_output: From VLM, contains 'hidden_states'.
+            action_input: Raw batch - pick what you need ('actions', 'state', etc.).
+        Returns:
+            dict with 'loss'.
+        """
+        ...
+
+    def predict_action(
+        self, vlm_output: dict[str, Tensor], action_input: dict[str, Tensor], **kwargs
+    ) -> dict[str, Tensor]:
+        """Predict actions; named `predict_action` because that is what QwenGr00t calls.
+        Args:
+            vlm_output: From VLM, contains 'hidden_states'.
+            action_input: Raw batch - pick what you need ('state', etc.).
+        Returns:
+            dict with the predicted actions: Tensor [B, horizon, action_dim].
+        """
+        ...
diff --git a/flagscale/models/vla/qwen_gr00t.py b/flagscale/models/vla/qwen_gr00t.py
new file mode 100644
index 0000000000..6d822e94ae
--- /dev/null
+++ b/flagscale/models/vla/qwen_gr00t.py
@@ -0,0 +1,162 @@
+# Mainly adopted from:
+# https://github.com/starVLA/starVLA/blob/3f7feefbc5fc25890ad3a7d262b8a0aea1339aa7/starVLA/model/framework/QwenGR00T.py
+# Below is the original copyright:
+
+# Copyright 2025 starVLA community. All rights reserved.
+# Licensed under the MIT License, Version 1.0 (the "License");
+# Implemented by [Junqiu YU / Fudan University] in [2025].
+# Design and Merged by [Jinhui YE / HKUST University] in [2025].
+
+"""
+Qwen-GR00T Framework
+A lightweight implementation that combines Qwen-VL with a flow-matching head to directly predict
+continuous actions. The flow-matching head originates from GR00T N1.5.
+"""
+
+import torch
+from transformers import PretrainedConfig, PreTrainedModel
+
+from flagscale.models.utils.constants import ACTION
+from flagscale.models.vla.registry import build_action_model, build_vlm
+from flagscale.train.train_config import TrainConfig
+
+
+class QwenGr00t(PreTrainedModel):
+    """
+    Multimodal vision-language-action model.
+
+    Components:
+      - Qwen VL interface for fused language/vision token embeddings
+      - DiT diffusion head for future action sequence modeling
+
+    Focus: Predict future continuous actions conditioned on images + instruction.
+    """
+
+    config_class = PretrainedConfig
+
+    def __init__(self, config: TrainConfig, **kwargs):
+        """
+        Build the VLM backbone and the action head selected in `config.model`.
+
+        Args:
+            config: TrainConfig; `model.vlm.type` (default "qwen3-vl") and
+                `model.action_model.type` (default "flow_matching") pick the registered classes.
+            **kwargs: Currently unused.
+        """
+        super().__init__(PretrainedConfig())
+        self._config = config
+
+        vlm_type = config.model.vlm.get("type", "qwen3-vl")
+        self.vlm = build_vlm(vlm_type, config=config)
+
+        action_model_type = config.model.action_model.get("type", "flow_matching")
+        self.action_model = build_action_model(
+            action_model_type,
+            vlm_config=self.vlm.model_config,
+            action_config={},
+            full_config=config,
+        )
+
+        self.future_action_window_size = config.model.action_model.future_action_window_size
+
+    def forward(self, examples: dict, **kwargs):
+        """
+        Compute the action-head training loss for one batch.
+
+        Args:
+            examples: Batch dict; `examples[ACTION]` holds the actions and the whole dict is
+                passed to the VLM's `prepare_input`.
+            **kwargs: Currently unused.
+        Returns:
+            The "loss" entry of the action model's output.
+        """
+        actions = examples[ACTION]
+        # State conditioning is disabled for now, so the action head always gets `state=None`.
+        state = None  # examples[OBS_STATE]
+
+        # Step 1: QWenVL input format
+        # NOTE: (yupu) The order of the images differs from starVLA, which is [image, wrist_image]
+        qwen_inputs = self.vlm.prepare_input(examples)
+
+        # Step 2: VLM forward
+        # TODO: (yupu) Hard-coded autocast and dtype, matches starVLA
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            vlm_output = self.vlm.forward(qwen_inputs, output_attentions=False)
+            # last_hidden_state: [B, seq_len, H]
+            last_hidden = vlm_output["hidden_states"][-1]  # [B, L, H]
+
+        # Step 3: Action Expert Forward and Loss
+        with torch.autocast("cuda", dtype=torch.float32):
+            # TODO: (yupu) Is this a bug or a feature? The action dtype would stay as bf16 under this autocast.
+            actions = actions.to(device=last_hidden.device, dtype=last_hidden.dtype)
+
+            # TODO: does not match RoboBrainX, need to check
+            actions_target = actions[
+                :, -(self.future_action_window_size + 1) :, :
+            ]  # (B, chunk_len, action_dim)
+
+            # TODO: (yupu) I believe there is a bug in starVLA, the
+            # `repeated_diffusion_steps` is not properly set in the config.
+            repeated_diffusion_steps = self._config.model.action_model.get(
+                "repeated_diffusion_steps", 4
+            )
+
+            # Repeat targets and VLM features `repeated_diffusion_steps` times along dim 0.
+            actions_repeated = actions_target.repeat(repeated_diffusion_steps, 1, 1)
+            last_hidden_repeated = last_hidden.repeat(repeated_diffusion_steps, 1, 1)
+
+            state_repeated = None
+            if state is not None:
+                state = state.to(device=last_hidden.device, dtype=last_hidden.dtype)
+                state_repeated = state.repeat(repeated_diffusion_steps, 1, 1)
+
+            # Use action head forward API
+            vlm_output_repeated = {"hidden_states": last_hidden_repeated}
+            action_input = {"actions": actions_repeated, "state": state_repeated}
+
+            output = self.action_model.forward(vlm_output_repeated, action_input)
+
+        return output["loss"]
+
+    @torch.inference_mode()
+    def predict_action(self, examples: list[dict], **kwargs) -> dict:
+        """
+        Predict actions for a batch of observations.
+
+        Steps:
+          1. Build the VLM inputs (images are assumed to be resized during preprocessing)
+          2. Encode with QwenVL (hidden states retained)
+          3. Run the action head's `predict_action` on the last hidden state
+
+        Args:
+            examples: Batch passed unchanged to the VLM's `prepare_input`.
+                NOTE(review): annotated as list[dict], but Qwen25VLBackbone.prepare_input indexes
+                the batch by key like a dict - confirm the expected format.
+            **kwargs: Currently unused.
+        Returns:
+            dict: Output of the action model's `predict_action`, assumed to map `ACTION` to the
+                normalized actions.
+        """
+        # TODO: (yupu) Fix inference input format to use constants (OBS_IMAGE, OBS_LANGUAGE, OBS_STATE)
+        # instead of hardcoded keys. The current keys are inconsistent with training batch format.
+
+        # We assume the images are already resized during preprocessing.
+        qwen_inputs = self.vlm.prepare_input(examples)
+        state = None  # examples[OBS_STATE]
+
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            vlm_output = self.vlm.forward(qwen_inputs, output_attentions=False)
+            # last_hidden_state: [B, seq_len, H]
+            last_hidden = vlm_output["hidden_states"][-1]  # [B, L, H]
+
+        if state is not None:
+            state = state.to(device=last_hidden.device, dtype=last_hidden.dtype)
+
+        # Action Expert Forward
+        with torch.autocast("cuda", dtype=torch.float32):
+            vlm_output_for_action = {"hidden_states": last_hidden}
+            action_input = {"state": state}
+            output = self.action_model.predict_action(vlm_output_for_action, action_input)
+
+        # Assume the action model returns a dict mapping `ACTION` to the normalized actions
+        return output
diff --git a/flagscale/models/vla/registry.py b/flagscale/models/vla/registry.py
new file mode 100644
index 0000000000..180e6a1961
--- /dev/null
+++ b/flagscale/models/vla/registry.py
@@ -0,0 +1,54 @@
+"""Name-keyed registries and factories for VLM backbones and action models."""
+
+VLM_REGISTRY: dict[str, type] = {}
+ACTION_MODEL_REGISTRY: dict[str, type] = {}
+
+
+def register_vlm(name: str):
+    """Return a class decorator that stores the class in `VLM_REGISTRY` under `name`."""
+
+    def _record(vlm_cls):
+        VLM_REGISTRY[name] = vlm_cls
+        return vlm_cls
+
+    return _record
+
+
+def register_action_model(name: str):
+    """Return a class decorator that stores the class in `ACTION_MODEL_REGISTRY` under `name`."""
+
+    def _record(action_cls):
+        ACTION_MODEL_REGISTRY[name] = action_cls
+        return action_cls
+
+    return _record
+
+
+def build_vlm(name: str, **kwargs):
+    """
+    Instantiate the VLM registered under `name`, forwarding `kwargs` to its constructor.
+
+    Raises:
+        ValueError: If `name` is not in `VLM_REGISTRY`.
+    """
+    if name in VLM_REGISTRY:
+        return VLM_REGISTRY[name](**kwargs)
+    raise ValueError(f"Unknown VLM: {name}. Available: {list(VLM_REGISTRY.keys())}")
+
+
+def build_action_model(name: str, vlm_config, action_config: dict, **kwargs):
+    """
+    Instantiate the action model registered under `name`.
+
+    `vlm_config`, `action_config` and any extra `kwargs` are passed to the constructor as
+    keyword arguments.
+
+    Raises:
+        ValueError: If `name` is not in `ACTION_MODEL_REGISTRY`.
+    """
+    if name in ACTION_MODEL_REGISTRY:
+        model_cls = ACTION_MODEL_REGISTRY[name]
+        return model_cls(vlm_config=vlm_config, action_config=action_config, **kwargs)
+    raise ValueError(
+        f"Unknown ActionModel: {name}. Available: {list(ACTION_MODEL_REGISTRY.keys())}"
+    )
diff --git a/flagscale/models/vla/utils.py b/flagscale/models/vla/utils.py
new file mode 100644
index 0000000000..b0ee5fdd12
--- /dev/null
+++ b/flagscale/models/vla/utils.py
@@ -0,0 +1,33 @@
+def get_vlm_config(vlm_config) -> dict:
+    """
+    Gather the fields shared by every VLM config, regardless of where the config stores them.
+
+    Args:
+        vlm_config: HF config object; each field is read from the object itself when present,
+            otherwise from its `text_config`.
+    Returns:
+        dict with 'hidden_size' and 'num_hidden_layers'.
+    Raises:
+        ValueError: If the config has neither the field nor a `text_config` attribute.
+    """
+    hidden_size = _get_hidden_size(vlm_config)
+    num_hidden_layers = _get_num_layers(vlm_config)
+    return {"hidden_size": hidden_size, "num_hidden_layers": num_hidden_layers}
+
+
+def _get_hidden_size(config) -> int:
+    """Return `hidden_size` from `config`, or from `config.text_config` when absent there."""
+    if not hasattr(config, "hidden_size"):
+        if not hasattr(config, "text_config"):
+            raise ValueError(f"Cannot determine hidden_size from config: {type(config)}")
+        return config.text_config.hidden_size
+    return config.hidden_size
+
+
+def _get_num_layers(config) -> int:
+    """Return `num_hidden_layers` from `config`, or from `config.text_config` when absent."""
+    if not hasattr(config, "num_hidden_layers"):
+        if not hasattr(config, "text_config"):
+            raise ValueError(f"Cannot determine num_hidden_layers from config: {type(config)}")
+        return config.text_config.num_hidden_layers
+    return config.num_hidden_layers
diff --git a/flagscale/models/vla/vlm/__init__.py b/flagscale/models/vla/vlm/__init__.py
new file mode 100644
index 0000000000..aa73dba2e0
--- /dev/null
+++ b/flagscale/models/vla/vlm/__init__.py
@@ -0,0 +1,3 @@
+from .qwen_vl import Qwen3VLBackbone, Qwen25VLBackbone, QwenVLBackbone
+
+__all__ = ["QwenVLBackbone", "Qwen25VLBackbone", "Qwen3VLBackbone"]
diff --git a/flagscale/models/vla/vlm/qwen_vl.py b/flagscale/models/vla/vlm/qwen_vl.py
new file mode 100644
index 0000000000..b6d45e38db
--- /dev/null
+++ b/flagscale/models/vla/vlm/qwen_vl.py
@@ -0,0 +1,349 @@
+import torch
+import torch.nn as nn
+from transformers import (
+    AutoProcessor,
+    PretrainedConfig,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen3VLForConditionalGeneration,
+)
+
+from flagscale.train.train_config import TrainConfig
+from flagscale.train.utils.image_tools import to_pil_preserve
+
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = 151655
+VIDEO_TOKEN_INDEX = 151656
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+
+_ACTION_TOKEN_MIN_QWEN25 = 151665
+_ACTION_TOKEN_MAX_QWEN25 = 153712
+_ACTION_TOKEN_MIN_QWEN3 = 151669
+_ACTION_TOKEN_MAX_QWEN3 = 153716
+
+
+class QwenVLBackbone(nn.Module):
+    """
+    Base class for Qwen VL backends; loads the model and its processor from `base_vlm`.
+
+    Args:
+        config: TrainConfig object with config.model.qwenvl namespace.
+    """
+
+    def __init__(self, config: TrainConfig, **kwargs):
+        super().__init__()
+        qwenvl_config = config.model.qwenvl
+        self.model_id = qwenvl_config.base_vlm
+
+        # TODO: (yupu) The model loaded by `from_pretrained` is eval mode by default, is this expected? I removed `policy.train()` in train_qwen_gr00t.py to match starVLA, but not sure if this is the right way to do this.
+        self.model = self._load_model(self.model_id)
+        self.processor = AutoProcessor.from_pretrained(self.model_id)
+        # FIXME: Hard-coded padding side
+        self.processor.tokenizer.padding_side = "left"
+        self._config: TrainConfig = config
+
+    def _load_model(self, model_id: str):
+        raise NotImplementedError  # subclasses return the model loaded from `model_id`
+
+    @property
+    def model_config(self) -> PretrainedConfig:
+        """HF config object (e.g., Qwen2VLConfig)."""
+        return self.model.config
+
+    def prepare_input(self, batch: dict) -> dict[str, torch.Tensor]:
+        raise NotImplementedError  # subclasses turn a raw batch into inputs for forward()
+
+    def forward(self, batch: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            outputs = self.model(
+                **batch,
+                output_hidden_states=True,
+                return_dict=True,
+                **kwargs,  # must not repeat the two keywords fixed above (duplicate -> TypeError)
+            )
+        # TODO: (yupu) We should output the original outputs, not just the hidden states.
+        return {"hidden_states": outputs.hidden_states}
+
+
+class Qwen25VLBackbone(QwenVLBackbone):
+    """Qwen2.5-VL backend."""
+
+    def __init__(self, config: TrainConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        self._ACTION_TOKEN_MIN = _ACTION_TOKEN_MIN_QWEN25
+        self._ACTION_TOKEN_MAX = _ACTION_TOKEN_MAX_QWEN25
+
+    def _load_model(self, model_id: str):
+        # WARNING: hard-coded attn_implementation and torch_dtype
+        return Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            attn_implementation="flash_attention_2",
+            torch_dtype="auto",
+        )
+
+    def prepare_input(self, batch: dict) -> dict[str, torch.Tensor]:
+        # TODO: (yupu) This is a hack, we should find a better way to handle this.
+        image_keys = self._config.data.vla_data.image_features
+        return self.build_qwenvl_inputs(examples=batch, image_keys=image_keys)
+
+    def build_qwenvl_inputs(
+        self,
+        examples,
+        images=None,
+        instructions=None,
+        image_keys=None,
+        solutions=None,
+        **kwargs,
+    ):
+        """Build padded processor inputs for a batch, plus `labels` when `solutions` is given."""
+        # TODO: (yupu) This is so ugly, we should find a better way to handle this.
+        def _tensor_to_pil_list(batch_tensor):
+            if not isinstance(batch_tensor, torch.Tensor):
+                return batch_tensor
+            if batch_tensor.ndim == 3:
+                batch_tensor = batch_tensor.unsqueeze(0)
+            if batch_tensor.ndim != 4:
+                raise ValueError(f"Expected image tensor with 4 dims, got {batch_tensor.shape}")
+            pil_images = []
+            for item in batch_tensor:
+                img = item if item.shape[-1] in (1, 3, 4) else item.permute(1, 2, 0)
+                pil_images.append(to_pil_preserve(img.detach().cpu().numpy()))
+            return pil_images
+
+        if examples is not None and (images is None or instructions is None):
+            # TODO: (yupu) hard-code task key to "task"
+            instructions = examples["task"]
+            if isinstance(instructions, torch.Tensor):
+                instructions = instructions.detach().cpu().tolist()
+            if isinstance(instructions, str):
+                instructions = [instructions]
+
+            batch_images = None
+            for key in image_keys:
+                key_images = _tensor_to_pil_list(examples[key])
+                if batch_images is None:
+                    batch_images = [[img] for img in key_images]
+                else:
+                    for sample_images, img in zip(batch_images, key_images):
+                        sample_images.append(img)
+
+            images = [[img for img in sample if img is not None] for sample in batch_images]
+
+        import warnings
+
+        from qwen_vl_utils import process_vision_info
+
+        # Create messages: one message per sample
+        messages = []
+        assert len(images) == len(instructions)
+        for imgs, instruction in zip(images, instructions):
+            content = [{"type": "image", "image": img} for img in imgs]
+
+            if "CoT_prompt" in self._config.data.vla_data:
+                CoT_prompt = self._config.data.vla_data.get("CoT_prompt", "")
+                prompt = CoT_prompt.replace("{instruction}", instruction)
+            else:
+                prompt = instruction
+
+            content.append({"type": "text", "text": prompt})
+            msg = [{"role": "user", "content": content}]
+
+            if solutions is not None:
+                solution = solutions[len(messages)]
+                msg.append({"role": "assistant", "content": [{"type": "text", "text": solution}]})
+            messages.append(msg)
+
+        # Prepare text prompts using processor
+        # default process is json --> message --> texts --> input_ids
+        texts = [
+            self.processor.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
+            for m in messages
+        ]
+
+        # image_inputs = list of PIL
+        image_inputs, video_inputs = process_vision_info(messages)
+        batch_input = self.processor(
+            text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
+        )
+
+        # if solutions, mask out the non solution tokens in labels --> @JinhuiYE can we mask out system prompt?
+        if solutions is not None:
+            # how can we know this range? --> we has other way for this, but is slower see qwenhelix branch
+            # here only for fast_tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md
+            labels = batch_input["input_ids"].clone()
+            # For each sequence in the batch, find the first occurrence of an action token.
+            for i in range(labels.size(0)):
+                seq = labels[i]
+                # Create a mask for tokens within the action token range.
+                mask_seq = (seq >= self._ACTION_TOKEN_MIN) & (seq <= self._ACTION_TOKEN_MAX)
+                nonzero_indices = torch.nonzero(mask_seq, as_tuple=False)
+                if nonzero_indices.numel() > 0:
+                    first_action_index = nonzero_indices[0].item()
+                    # Mask out all tokens before the first action token.
+                    seq[:first_action_index] = IGNORE_INDEX
+                else:
+                    seq[:] = IGNORE_INDEX
+                    warnings.warn(
+                        "No action tokens found in a label sequence; all of its labels are ignored. "
+                        "See starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md.",
+                        RuntimeWarning,
+                        stacklevel=2,
+                    )
+            labels[labels == self.processor.tokenizer.pad_token_id] = IGNORE_INDEX
+            batch_input["labels"] = labels
+
+        return batch_input.to(self.model.device)
+
+
+class Qwen3VLBackbone(QwenVLBackbone):
+    """Qwen3-VL backend."""
+
+    def __init__(self, config: TrainConfig, **kwargs):
+        """Initialize the parent backbone; "-Action" models get the Qwen3 action-token range."""
+        super().__init__(config, **kwargs)
+
+        # Only for fast base model
+        if "-Action" in self.model_id:
+            self._ACTION_TOKEN_MIN = _ACTION_TOKEN_MIN_QWEN3
+            self._ACTION_TOKEN_MAX = _ACTION_TOKEN_MAX_QWEN3
+
+    def _load_model(self, model_id: str) -> Qwen3VLForConditionalGeneration:
+        """Load Qwen3-VL from `model_id` with flash-attention 2 in bfloat16."""
+        # FIXME: hard-coded attn_implementation and torch_dtype matches starVLA
+        # TODO: (yupu): During inference/serving, it's required to load model twice, not only that, the original qwen model has to be loaded!
+        model = Qwen3VLForConditionalGeneration.from_pretrained(
+            model_id,
+            attn_implementation="flash_attention_2",
+            torch_dtype=torch.bfloat16,
+        )
+        # Align dims qwen3 with qwen2.5, actually it's not needed in our case
+        model.config.hidden_size = model.config.text_config.hidden_size
+        return model
+
+    def prepare_input(self, batch: dict) -> dict[str, torch.Tensor]:
+        """Build tokenized multimodal inputs from `batch` using two fixed image keys."""
+        # TODO: (yupu) This is a hack, we should find a better way to handle this.
+        # image_keys = self._config.data.vla_data.image_features.keys()
+        image_keys = ["observation.images.image", "observation.images.wrist_image"]
+        return self.build_qwenvl_inputs(examples=batch, image_keys=image_keys)
+
+    # TODO: (yupu) Refactor this args
+    def build_qwenvl_inputs(
+        self,
+        examples,
+        images=None,
+        instructions=None,
+        image_keys=None,
+        solutions=None,
+        **kwargs,
+    ):
+        """Tokenize a batch of images and instructions into Qwen3-VL chat inputs.
+
+        Args:
+            examples: Batch mapping, read only when `images` or `instructions` is None;
+                supplies `examples["task"]` and one image entry per key in `image_keys`.
+            images: Optional per-sample lists of images.
+            instructions: Optional per-sample instruction strings.
+            image_keys: Keys of `examples` holding images; needed when reading from it.
+            solutions: Optional per-sample assistant texts. When given, a `labels`
+                entry is added in which every token before the first action token,
+                and every pad token, is set to IGNORE_INDEX.
+            **kwargs: Unused.
+
+        Returns:
+            The `apply_chat_template` output moved to `self.model.device`.
+        """
+        import warnings
+
+        # TODO: (yupu) This is so ugly, we should find a better way to handle this.
+        def _tensor_to_pil_list(batch_tensor):
+            if not isinstance(batch_tensor, torch.Tensor):
+                return batch_tensor
+            if batch_tensor.ndim == 3:
+                batch_tensor = batch_tensor.unsqueeze(0)
+            if batch_tensor.ndim != 4:
+                raise ValueError(f"Expected image tensor with 4 dims, got {batch_tensor.shape}")
+            pil_images = []
+            for item in batch_tensor:
+                if item.shape[-1] in (1, 3, 4):
+                    img = item
+                else:
+                    img = item.permute(1, 2, 0)
+                pil_images.append(to_pil_preserve(img.detach().cpu().numpy()))
+            return pil_images
+
+        if examples is not None and (images is None or instructions is None):
+            # TODO: (yupu) hard-code task key to "task"
+            instructions = examples["task"]
+            if isinstance(instructions, torch.Tensor):
+                instructions = instructions.detach().cpu().tolist()
+            if isinstance(instructions, str):
+                instructions = [instructions]
+
+            batch_images = None
+            for key in image_keys:
+                key_images = _tensor_to_pil_list(examples[key])
+                if batch_images is None:
+                    batch_images = [[img] for img in key_images]
+                else:
+                    for sample_images, img in zip(batch_images, key_images):
+                        sample_images.append(img)
+
+            images = [[img for img in sample if img is not None] for sample in batch_images]
+
+        # Create messages: one message per sample
+        messages = []
+        assert len(images) == len(instructions)
+        for imgs, instruction in zip(images, instructions):
+            content = [{"type": "image", "image": img} for img in imgs]
+
+            if "CoT_prompt" in self._config.data.vla_data:
+                CoT_prompt = self._config.data.vla_data.get("CoT_prompt", "")
+                prompt = CoT_prompt.replace("{instruction}", instruction)
+            else:
+                prompt = instruction
+
+            content.append({"type": "text", "text": prompt})
+            msg = [{"role": "user", "content": content}]
+
+            if solutions is not None:
+                solution = solutions[len(messages)]
+                msg.append({"role": "assistant", "content": [{"type": "text", "text": solution}]})
+            messages.append(msg)
+
+        # Preparation for inference
+        batch_inputs = self.processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            padding=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+
+        # if solutions, mask the non-action prefix in labels (only for fast_tokenizer now)
+        if solutions is not None:
+            # the action-token range is fast_tokenizer specific, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md
+            labels = batch_inputs["input_ids"].clone()
+            # For each sequence in the batch, find the first occurrence of an action token.
+            for i in range(labels.size(0)):
+                seq = labels[i]
+                # Create a mask for tokens within the action token range.
+                mask_seq = (seq >= self._ACTION_TOKEN_MIN) & (seq <= self._ACTION_TOKEN_MAX)
+                nonzero_indices = torch.nonzero(mask_seq, as_tuple=False)
+                if nonzero_indices.numel() > 0:
+                    # Mask out all tokens before the first action token.
+                    seq[: nonzero_indices[0].item()] = IGNORE_INDEX
+                else:
+                    # If no action token is found, mask the entire sequence and say so.
+                    seq[:] = IGNORE_INDEX
+                    warnings.warn(
+                        "No action tokens found in the sequence; make sure they are in your tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md.",
+                        RuntimeWarning,
+                    )
+
+            # Mask out pad tokens as well
+            labels[labels == self.processor.tokenizer.pad_token_id] = IGNORE_INDEX
+            batch_inputs["labels"] = labels
+
+        return batch_inputs.to(self.model.device)
diff --git a/flagscale/models/vlm/__init__.py b/flagscale/models/vlm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/flagscale/models/vlm/qwen2_5_vl.py b/flagscale/models/vlm/qwen2_5_vl.py
new file mode 100644
index 0000000000..b3981c04b6
--- /dev/null
+++ b/flagscale/models/vlm/qwen2_5_vl.py
@@ -0,0 +1,294 @@
+# Mainly adopted from:
+# https://github.com/starVLA/starVLA/blob/3f7feefbc5fc25890ad3a7d262b8a0aea1339aa7/starVLA/model/modules/vlm/QWen2_5.py
+# Below is the original copyright:
+
+# Copyright 2025 starVLA community. All rights reserved.
+# Licensed under the MIT License, Version 1.0 (the "License");
+# Implemented by [Jinhui YE / HKUST University] in [2025].
+
+
+import torch
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+IGNORE_INDEX = -100  # label value written over positions masked out in build_qwenvl_inputs
+IMAGE_TOKEN_INDEX = 151655
+VIDEO_TOKEN_INDEX = 151656
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+
+_ACTION_TOKEN_MIN = 151665  # inclusive lower bound of the action-token id range; how can we know this range?
+_ACTION_TOKEN_MAX = 153712  # inclusive upper bound; here only for fast_tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md
+
+
+import torch.nn as nn
+
+
+class _QWen_VL_Interface(nn.Module):
+    """
+    This exists because of the diversity of VLMs, so we encapsulate the changes here.
+    Lightweight wrapper around Qwen2.5-VL (Qwen2_5_VLForConditionalGeneration).
+
+    Purpose:
+        - Unify interface with other VLM backends (CausalLM-like usage).
+        - Centralize preprocessing (tokenization + multimodal packing).
+        - Provide consistent forward / generate signatures.
+
+    Notes:
+        - Keeps original model behavior; does not modify internal architecture.
+        - Mixed precision handled via torch.autocast in forward / generate.
+        - Adaptation layer can be extended for future multi-modal routing if needed.
+    """
+
+    def __init__(self, config: dict | None = None, **kwargs):
+        """
+        Initialize the Qwen2.5-VL wrapper.
+
+        Parameters:
+            config (dict | Any | None):
+                Must expose `config.model.qwenvl` supporting `.get(...)`, where:
+                    base_vlm (str): HuggingFace model id or local path
+                        (default "Qwen/Qwen2.5-VL-3B-Instruct").
+                    attn_implementation (str): attention backend passed to
+                        from_pretrained (default "eager").
+                config.data.vla_data.get("CoT_prompt", str) may be used later in build_qwenvl_inputs.
+                Despite the `None` default, passing None fails on the first attribute access.
+            **kwargs:
+                Ignored currently; placeholder for future extension (e.g., override device_map, dtype).
+
+        Side Effects:
+            - Downloads / loads pretrained Qwen2.5-VL weights (unless cached).
+            - Instantiates AutoProcessor and forces tokenizer left padding.
+
+        Attributes Set:
+            self.model (Qwen2_5_VLForConditionalGeneration)
+            self.processor (AutoProcessor)
+            self.config (original config reference)
+            self._ACTION_TOKEN_MIN / self._ACTION_TOKEN_MAX (action-token id range used for label masking)
+
+        Notes:
+            - No device_map is passed to from_pretrained in this constructor.
+            - torch_dtype="auto" is passed to from_pretrained.
+            - tokenizer padding_side forced to 'left' (important for generation + KV caching alignment).
+        """
+        super().__init__()
+
+        qwenvl_config = config.model.qwenvl
+        model_id = qwenvl_config.get("base_vlm", "Qwen/Qwen2.5-VL-3B-Instruct")
+
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            # attn_implementation="flash_attention_2",
+            attn_implementation=qwenvl_config.get("attn_implementation", "eager"),
+            torch_dtype="auto",
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+        processor.tokenizer.padding_side = "left"
+
+        self.model = model
+        self.processor = processor
+        self.config = config
+
+        self._ACTION_TOKEN_MIN = _ACTION_TOKEN_MIN
+        self._ACTION_TOKEN_MAX = _ACTION_TOKEN_MAX
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        image_grid_thw: torch.FloatTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = False,
+        output_hidden_states: bool | None = True,
+        return_dict: bool | None = True,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        """
+        Forward pass delegating to underlying Qwen2.5-VL backbone.
+
+        Args:
+            input_ids (LongTensor | None): [B, T] token ids (mutually exclusive with inputs_embeds).
+            attention_mask (Tensor | None): [B, T], 1 = attend, 0 = masked.
+            pixel_values (FloatTensor | None): Vision batch (model-specific preprocessed shape).
+            labels (LongTensor | None): [B, T] LM targets; ignored positions = -100 (IGNORE_INDEX).
+            image_grid_thw (FloatTensor | None): Optional tiling metadata (e.g., [B, 3] for temporal/height/width splits).
+            inputs_embeds (FloatTensor | None): [B, T, D] alternative embedding input.
+            past_key_values (List[FloatTensor] | None): Cached KV states for incremental decoding.
+            use_cache (bool | None): If True, returns updated past_key_values.
+            output_attentions (bool): Whether to include attention maps.
+            output_hidden_states (bool): Must be True if downstream modules consume hidden states.
+            return_dict (bool): Return HF dataclass if True; else tuple.
+            **kwargs: Extra args forwarded to underlying model.
+
+        Returns:
+            CausalLMOutputWithPast | tuple: HF-standard structure (logits, past_key_values, hidden_states, etc.).
+
+        Notes:
+            - Autocast(bfloat16) used for efficiency.
+            - padding_side already set to 'left' in tokenizer at init.
+            - Hidden states required for auxiliary alignment or feature extraction modules.
+        """
+
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+                labels=labels,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                **kwargs,
+            )
+
+        return outputs
+
+    def generate(
+        self,
+        **kwargs,
+    ):
+        """
+        High-level generation interface (auto-regressive decoding), optionally vision-conditioned.
+
+        Args:
+            **kwargs: fully follow raw model.generate() signature.
+        Returns:
+            GenerateOutput | Model-dependent generation return.
+        """
+        with torch.autocast("cuda", dtype=torch.float16):
+            generation_output = self.model.generate(
+                **kwargs,
+            )
+        return generation_output
+
+    def build_qwenvl_inputs(self, images, instructions, solutions=None, **kwargs):
+        """
+        Construct and tokenize multimodal chat-style inputs for Qwen2.5-VL (batched).
+
+        Overview:
+            For each sample i:
+                - Takes a list of PIL images: images[i] = [img_0, img_1, ...]
+                - Takes a matching instruction string instructions[i]
+                - Optionally formats instruction with a chain-of-thought template (CoT_prompt) if present in config.
+                - Builds a single-turn chat message containing:
+                      [{"role": "user", "content": [
+                          {"type": "image", "image": <PIL.Image>}, ...,
+                          {"type": "text", "text": <final_prompt>}
+                      ]}]
+                - Applies processor.apply_chat_template(..., add_generation_prompt=True)
+                - Extracts vision inputs via process_vision_info
+                - Calls processor(...) to produce a BatchFeature with token + vision tensors.
+
+        Parameters:
+            images (List[List[PIL.Image.Image]]):
+                Length B. Each element is a (possibly empty) list of PIL images associated with that instruction.
+                Supports multi-image inputs (ordered). For video-as-frames, upstream code should decide packaging.
+            instructions (List[str]):
+                Length B textual prompts or task instructions.
+            solutions (List[str] | None):
+                Optional per-sample assistant replies. When given, solutions[i] is appended to sample i as an
+                assistant turn and a `labels` tensor is added to the result (see Returns).
+            **kwargs:
+                Reserved for future extensions (e.g., system prompts, style controls, additional metadata).
+
+        Config Dependencies:
+            self.config.data.vla_data.get("CoT_prompt", str):
+                If present, each instruction string is injected into the template by replacing "{instruction}".
+
+        Returns:
+            BatchFeature (HF):
+                Typical keys (moved to self.model.device):
+                    input_ids: LongTensor [B, T]
+                    attention_mask: LongTensor/Bool [B, T]
+                    pixel_values / image_grid / video specifics (model-dependent)
+                    (Possibly) token_type_ids or other processor outputs
+                    labels (only when solutions is given): copy of input_ids with every token before the
+                        first action token, and every pad token, set to IGNORE_INDEX.
+                The structure aligns with what Qwen2_5_VLForConditionalGeneration.forward expects.
+
+        Shapes / Notes:
+            - Sequence length T varies by number of images (special tokens) + prompt length.
+            - pixel_values may have internal batching distinct from B if images are flattened; underlying model maps them.
+            - The association between images and textual placeholders is preserved by processor ordering.
+
+        Edge Cases:
+            - Empty image list per sample is allowed (pure text prompt).
+            - Mismatched lengths of images and instructions raise AssertionError.
+            - CoT prompt replacement is naive string replace; ensure template contains "{instruction}" placeholder.
+
+        Performance:
+            - This path aims for faster inference vs. more granular per-turn assembly.
+            - Minor tokenization differences (e.g., whitespace) can affect highly overfitted benchmarks.
+
+        """
+        import warnings
+
+        # Create messages: one message per sample
+        messages = []
+        assert len(images) == len(instructions), "Images and instructions must have the same length"
+        for imgs, instruction in zip(images, instructions):
+            content = [{"type": "image", "image": img} for img in imgs]
+
+            if "CoT_prompt" in self.config.data.vla_data:  # If using a grounding prompt to task
+                CoT_prompt = self.config.data.vla_data.get("CoT_prompt", "")
+                prompt = CoT_prompt.replace("{instruction}", instruction)
+            else:
+                prompt = instruction
+
+            content.append({"type": "text", "text": prompt})
+            msg = [{"role": "user", "content": content}]
+
+            if solutions is not None:
+                solution = solutions[len(messages)]
+                msg.append({"role": "assistant", "content": [{"type": "text", "text": solution}]})
+            messages.append(msg)
+
+        # Prepare text prompts using processor
+        # default process is json --> message --> texts --> input_ids
+        texts = [
+            self.processor.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
+            for m in messages
+        ]
+
+        # image_inputs = list of PIL
+        image_inputs, video_inputs = process_vision_info(messages)
+        batch_input = self.processor(
+            text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
+        )
+
+        # if solutions, mask out the non solution tokens in labels --> @JinhuiYE can we mask out system prompt?
+        if solutions is not None:
+            # the action-token range is fast_tokenizer specific, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md
+            labels = batch_input["input_ids"].clone()
+            # For each sequence in the batch, find the first occurrence of an action token.
+            for i in range(labels.size(0)):
+                seq = labels[i]
+                # Create a mask for tokens within the action token range.
+                mask_seq = (seq >= self._ACTION_TOKEN_MIN) & (seq <= self._ACTION_TOKEN_MAX)
+                nonzero_indices = torch.nonzero(mask_seq, as_tuple=False)
+                if nonzero_indices.numel() > 0:
+                    first_action_index = nonzero_indices[0].item()
+                    # Mask out all tokens before the first action token.
+                    seq[:first_action_index] = IGNORE_INDEX
+                else:
+                    # If no action token is found, mask the entire sequence and say so.
+                    seq[:] = IGNORE_INDEX
+                    warnings.warn(
+                        "No action tokens found in the sequence; make sure they are in your tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md.",
+                        RuntimeWarning,
+                    )
+
+            # Mask out pad tokens as well
+            labels[labels == self.processor.tokenizer.pad_token_id] = IGNORE_INDEX
+            batch_input["labels"] = labels
+
+        return batch_input.to(self.model.device)
diff --git a/flagscale/models/vlm/qwen3_vl.py b/flagscale/models/vlm/qwen3_vl.py
new file mode 100644
index 0000000000..651225f46d
--- /dev/null
+++ b/flagscale/models/vlm/qwen3_vl.py
@@ -0,0 +1,257 @@
+# Copyright 2025 starVLA community. All rights reserved.
+# Licensed under the MIT License, Version 1.0 (the "License");
+# Implemented by [Jinhui YE / HKUST University] in [2025].
+
+
+import torch
+from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from flagscale.train.utils.image_tools import to_pil_preserve
+
+IGNORE_INDEX = -100  # label value written over positions masked out in build_qwenvl_inputs
+IMAGE_TOKEN_INDEX = 151655
+VIDEO_TOKEN_INDEX = 151656
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+
+_ACTION_TOKEN_MIN = 151669  # inclusive lower bound of the action-token id range; check how you add fast tokens into VLM
+_ACTION_TOKEN_MAX = 153716  # inclusive upper bound; here only for fast_tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md
+
+
+import torch.nn as nn
+
+
+class _QWen3_VL_Interface(nn.Module):
+    """
+    This exists because of the diversity of VLMs, so we encapsulate the changes here.
+    Lightweight wrapper around Qwen3-VL (Qwen3VLForConditionalGeneration).
+
+    Purpose:
+        - Unify interface with other VLM backends (CausalLM-like usage).
+        - Centralize preprocessing (tokenization + multimodal packing).
+        - Provide consistent forward / generate signatures.
+
+    """
+
+    def __init__(self, config: dict | None = None, **kwargs):
+        """
+        Initialize the Qwen3-VL wrapper.
+        Following https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct
+        Reads `config.model.qwenvl.base_vlm` as the model id (default "Qwen/Qwen3-VL-4B-Instruct").
+        """
+        super().__init__()
+
+        qwenvl_config = config.model.qwenvl
+        model_id = qwenvl_config.get("base_vlm", "Qwen/Qwen3-VL-4B-Instruct")
+
+        model = Qwen3VLForConditionalGeneration.from_pretrained(
+            model_id,
+            attn_implementation="flash_attention_2",
+            dtype=torch.bfloat16,
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+        processor.tokenizer.padding_side = "left"
+
+        self.model = model
+        self.processor = processor
+        self.config = config
+
+        # align qwen3 with qwen2.5: expose the text hidden size on the top-level config
+        self.model.config.hidden_size = self.model.config.text_config.hidden_size
+
+        # only for fast base model (the attributes stay unset for other model ids)
+        if "-Action" in model_id:
+            self._ACTION_TOKEN_MIN = _ACTION_TOKEN_MIN
+            self._ACTION_TOKEN_MAX = _ACTION_TOKEN_MAX
+
+    def forward(
+        self,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        """
+        Forward pass delegating to underlying Qwen3-VL backbone under bfloat16 autocast.
+        """
+
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            outputs = self.model(
+                **kwargs,
+            )
+
+        return outputs
+
+    def generate(
+        self,
+        **kwargs,
+    ):
+        """
+        High-level generation interface (auto-regressive decoding), optionally vision-conditioned.
+
+        Args:
+            **kwargs: fully follow raw model.generate() signature.
+        Returns:
+            GenerateOutput | Model-dependent generation return.
+        """
+        with torch.autocast("cuda", dtype=torch.float16):
+            generation_output = self.model.generate(
+                **kwargs,
+            )
+        return generation_output
+
+    def build_qwenvl_inputs(
+        self,
+        examples,
+        images=None,
+        instructions=None,
+        image_keys=None,
+        solutions=None,
+        **kwargs,
+    ):
+        """
+        Build model inputs from raw data (images + instructions + optional solutions).
+        Follow Official Qwen3-VL Instruct format: https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct
+
+        Args:
+            examples: Batch mapping, read only when `images` or `instructions` is None;
+                then `examples["task"]` supplies the instructions and `examples[key]`
+                the images for every key in `image_keys`.
+            images: Optional per-sample lists of images.
+            instructions: Optional per-sample instruction strings.
+            image_keys: Keys of `examples` that hold images; must be iterable whenever
+                the images are taken from `examples`.
+            solutions: Optional per-sample assistant texts. When given, a `labels`
+                entry is added in which every token before the first action token,
+                and every pad token, is set to IGNORE_INDEX.
+            **kwargs: Unused.
+
+        Returns:
+            The `apply_chat_template` output moved to `self.model.device`.
+        """
+        import warnings
+
+        # TODO: (yupu) This is so ugly, we should find a better way to handle this.
+        def _tensor_to_pil_list(batch_tensor):
+            # Anything that is not a tensor is passed through untouched.
+            if not isinstance(batch_tensor, torch.Tensor):
+                return batch_tensor
+            if batch_tensor.ndim == 3:
+                batch_tensor = batch_tensor.unsqueeze(0)
+            if batch_tensor.ndim != 4:
+                raise ValueError(f"Expected image tensor with 4 dims, got {batch_tensor.shape}")
+            pil_images = []
+            for item in batch_tensor:
+                # A last dim of 1/3/4 is kept as-is; otherwise the first dim is moved last.
+                if item.shape[-1] in (1, 3, 4):
+                    img = item
+                else:
+                    img = item.permute(1, 2, 0)
+                pil_images.append(to_pil_preserve(img.detach().cpu().numpy()))
+            return pil_images
+
+        if examples is not None and (images is None or instructions is None):
+            instructions = examples["task"]
+            if isinstance(instructions, torch.Tensor):
+                instructions = instructions.detach().cpu().tolist()
+            if isinstance(instructions, str):
+                instructions = [instructions]
+
+            # Collect one image per key for every sample, in `image_keys` order.
+            batch_images = None
+            for key in image_keys:
+                key_images = _tensor_to_pil_list(examples[key])
+                if batch_images is None:
+                    batch_images = [[img] for img in key_images]
+                else:
+                    for sample_images, img in zip(batch_images, key_images):
+                        sample_images.append(img)
+
+            # Drop missing (None) images from every sample.
+            for idx, sample_images in enumerate(batch_images):
+                batch_images[idx] = [img for img in sample_images if img is not None]
+
+            images = batch_images
+
+        # Create messages: one message per sample
+        messages = []
+        assert len(images) == len(instructions), "Images and instructions must have the same length"
+        for imgs, instruction in zip(images, instructions):
+            content = [{"type": "image", "image": img} for img in imgs]
+
+            if "CoT_prompt" in self.config.data.vla_data:  # If using a grounding prompt to task
+                CoT_prompt = self.config.data.vla_data.get("CoT_prompt", "")
+                prompt = CoT_prompt.replace("{instruction}", instruction)
+            else:
+                prompt = instruction
+
+            content.append({"type": "text", "text": prompt})
+            msg = [{"role": "user", "content": content}]
+
+            if solutions is not None:
+                solution = solutions[len(messages)]
+                msg.append({"role": "assistant", "content": [{"type": "text", "text": solution}]})
+            messages.append(msg)
+
+        # Preparation for inference
+        batch_inputs = self.processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            padding=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+
+        # if solutions, mask the non-action prefix in labels (only for fast_tokenizer now)
+        if solutions is not None:
+            action_token_min = _ACTION_TOKEN_MIN  # how can we know this range? --> we has other way for this, but is slower see qwenhelix branch
+            action_token_max = _ACTION_TOKEN_MAX  # here only for fast_tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md
+            labels = batch_inputs["input_ids"].clone()
+            # For each sequence in the batch, find the first occurrence of an action token.
+            for i in range(labels.size(0)):
+                seq = labels[i]
+                # Create a mask for tokens within the action token range.
+                mask_seq = (seq >= action_token_min) & (seq <= action_token_max)
+                nonzero_indices = torch.nonzero(mask_seq, as_tuple=False)
+                if nonzero_indices.numel() > 0:
+                    first_action_index = nonzero_indices[0].item()
+                    # Mask out all tokens before the first action token.
+                    seq[:first_action_index] = IGNORE_INDEX
+                else:
+                    # If no action token is found, mask the entire sequence and say so.
+                    seq[:] = IGNORE_INDEX
+                    warnings.warn(
+                        "No action tokens found in the sequence; make sure they are in your tokenizer, see starVLA/model/modules/vlm/tools/add_qwen_special_tokens/README.md.",
+                        RuntimeWarning,
+                    )
+
+            # Mask out pad tokens as well
+            labels[labels == self.processor.tokenizer.pad_token_id] = IGNORE_INDEX
+            batch_inputs["labels"] = labels
+
+        return batch_inputs.to(self.model.device)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    import debugpy
+    from omegaconf import OmegaConf
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config_yaml",
+        type=str,
+        default="./starVLA/config/training/starvla_cotrain_oxe.yaml",
+        help="Path to YAML config",
+    )
+    args, clipargs = parser.parse_known_args()
+
+    # NOTE(review): listens on all interfaces; restrict the host if remote attach is not needed.
+    debugpy.listen(("0.0.0.0", 10092))
+    print("🔍 Rank 0 waiting for debugger attach on port 10092...")
+    debugpy.wait_for_client()
+
+    cfg = OmegaConf.load(args.config_yaml)
+    # The wrapper reads config.model.qwenvl, so the override has to be set under that key.
+    cfg.model.qwenvl.base_vlm = "./playground/Pretrained_models/Qwen3-VL-4B-Instruct"
+    qwen_vl = _QWen3_VL_Interface(cfg)
diff --git a/flagscale/serve/msgpack_numpy.py b/flagscale/serve/msgpack_numpy.py
new file mode 100644
index 0000000000..007f755edf
--- /dev/null
+++ b/flagscale/serve/msgpack_numpy.py
@@ -0,0 +1,57 @@
+"""Adds NumPy array support to msgpack.
+
+msgpack is good for (de)serializing data over a network for multiple reasons:
+- msgpack is secure (as opposed to pickle/dill/etc which allow for arbitrary code execution)
+- msgpack is widely used and has good cross-language support
+- msgpack does not require a schema (as opposed to protobuf/flatbuffers/etc) which is convenient in dynamically typed
+    languages like Python and JavaScript
+- msgpack is fast and efficient (as opposed to readable formats like JSON/YAML/etc); I found that msgpack was ~4x faster
+    than pickle for serializing large arrays using the below strategy
+
+The code below is adapted from https://github.com/lebedov/msgpack-numpy. The reason not to use that library directly is
+that it falls back to pickle for object arrays.
+"""
+
+import functools
+
+import msgpack
+import numpy as np
+
+
+def pack_array(obj):
+    """msgpack ``default`` hook: encode NumPy arrays and scalars as tagged dicts.
+
+    Non-NumPy objects are returned unchanged. Void ("V"), object ("O") and
+    complex ("c") dtypes are rejected with ``ValueError``.
+    """
+    is_array = isinstance(obj, np.ndarray)
+    is_scalar = isinstance(obj, np.generic)
+    if not (is_array or is_scalar):
+        return obj
+
+    dtype = obj.dtype
+    if dtype.kind in ("V", "O", "c"):
+        raise ValueError(f"Unsupported dtype: {obj.dtype}")
+
+    if is_array:
+        payload = obj.tobytes()
+        return {b"__ndarray__": True, b"data": payload, b"dtype": dtype.str, b"shape": obj.shape}
+    # NumPy scalar: ship the plain Python value plus the dtype needed to rebuild it.
+    return {b"__npgeneric__": True, b"data": obj.item(), b"dtype": dtype.str}
+
+
+def unpack_array(obj):
+    """msgpack ``object_hook``: rebuild NumPy values from ``pack_array`` dicts."""
+    if b"__ndarray__" not in obj and b"__npgeneric__" not in obj:
+        return obj  # ordinary map, pass through untouched
+    dtype = np.dtype(obj[b"dtype"])
+    if b"__ndarray__" in obj:
+        return np.ndarray(buffer=obj[b"data"], dtype=dtype, shape=obj[b"shape"])
+    return dtype.type(obj[b"data"])
+
+
+Packer = functools.partial(msgpack.Packer, default=pack_array)  # streaming packer
+packb = functools.partial(msgpack.packb, default=pack_array)  # one-shot pack to bytes
+# Decoding counterparts: NumPy values are rebuilt through the object hook.
+Unpacker = functools.partial(msgpack.Unpacker, object_hook=unpack_array)
+unpackb = functools.partial(msgpack.unpackb, object_hook=unpack_array)
diff --git a/flagscale/serve/run_serve_qwen_gr00t.py b/flagscale/serve/run_serve_qwen_gr00t.py
new file mode 100644
index 0000000000..18fd45b6d8
--- /dev/null
+++ b/flagscale/serve/run_serve_qwen_gr00t.py
@@ -0,0 +1,89 @@
+import argparse
+import importlib
+import time
+
+import torch
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+from flagscale.logger import logger
+from flagscale.models.utils.constants import ACTION
+from flagscale.serve.websocket_policy_server import WebsocketPolicyServer
+from flagscale.train.utils.train_utils import load_checkpoint
+
+
+class Policy:
+    """Checkpoint-backed policy exposing ``infer`` for ``WebsocketPolicyServer``."""
+
+    def __init__(self, config: DictConfig | ListConfig):
+        """Read ``engine_args`` (host, port, model settings) and load the model eagerly."""
+        self.config_engine = config["engine_args"]
+        self.host = self.config_engine.get("host", "0.0.0.0")
+        self.port = self.config_engine.get("port", 5000)
+        self.model = None
+        self.preprocessor = None
+        self.postprocessor = None
+        self.load_model()
+
+    def load_model(self):
+        """Resolve ``model_variant`` in ``flagscale.models.vla`` and load its checkpoint."""
+        t_s = time.perf_counter()
+        model_variant = self.config_engine.model_variant
+        policy = getattr(importlib.import_module("flagscale.models.vla"), model_variant)
+        self.model, self.preprocessor, self.postprocessor = load_checkpoint(
+            self.config_engine.model, policy, self.config_engine.device
+        )
+        # TODO: (yupu): model.to(dtype)?
+        logger.info(f"Policy model loading latency: {time.perf_counter() - t_s:.2f}s")
+
+    def infer(self, batch):
+        """Run the policy on ``batch["examples"][0]``; returns the action dict (NumPy ACTION)."""
+        # FIXME: image resize
+        logger.info("Start to inference")
+        # TODO: (yupu) remove hard-code
+        batch = batch["examples"][0]
+        for k, v in batch.items():
+            if "image" in k:
+                # Summarise via the logger; entries without ``shape`` must not crash serving.
+                logger.info(f"{k}: type {type(v)} shape {getattr(v, 'shape', None)}")
+        batch = self.preprocessor(batch)
+
+        with torch.no_grad():
+            action = self.model.predict_action(batch)
+            logger.info(f"action before postprocessor: {action}")
+        logger.info("Applying postprocessor...")
+        action = self.postprocessor(action)
+        # Convert to numpy for msgpack serialization
+        action[ACTION] = action[ACTION].detach().cpu().numpy()
+        return action
+
+
+def parse_config() -> DictConfig | ListConfig:
+    """Parse CLI arguments and load the YAML file named by ``--config-path`` via OmegaConf."""
+    # NOTE(review): ``--log-dir`` is required but never read in this function.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config-path", type=str, required=True, help="Path to the configuration YAML file"
+    )
+    parser.add_argument("--log-dir", type=str, required=True, help="Path to the log")
+    args = parser.parse_args()
+    config = OmegaConf.load(args.config_path)
+    return config
+
+
+def main(config):
+    """Build the ``Policy`` from ``config`` and serve it over websocket (blocking)."""
+    policy = Policy(config)
+    logger.info("Done")
+    server = WebsocketPolicyServer(
+        policy=policy,
+        host=policy.host,
+        port=policy.port,
+        metadata={"env": "simpler_env"},
+    )
+    logger.info("server running ...")
+    server.serve_forever()
+
+
+if __name__ == "__main__":
+    parsed_cfg = parse_config()
+    main(parsed_cfg["serve"][0])  # only the first entry of the "serve" list is served
diff --git a/flagscale/serve/websocket_policy_server.py b/flagscale/serve/websocket_policy_server.py
new file mode 100644
index 0000000000..e2cf5e5978
--- /dev/null
+++ b/flagscale/serve/websocket_policy_server.py
@@ -0,0 +1,93 @@
+import asyncio
+import http
+import time
+import traceback
+from typing import Protocol, runtime_checkable
+
+import websockets.asyncio.server as _server
+import websockets.frames
+from websockets.http11 import Request, Response
+
+from . import msgpack_numpy
+from flagscale.logger import logger
+
+
+@runtime_checkable
+class Policy(Protocol):  # structural interface: anything with ``infer(obs) -> dict`` qualifies
+    def infer(self, obs: dict) -> dict: ...
+
+
+class WebsocketPolicyServer:
+    """Serves a policy over websocket for evaluation inference.
+
+    Protocol:
+      1. On connect, server sends metadata dict to client.
+      2. Client sends msgpack-encoded obs dict, server returns msgpack-encoded action dict.
+      3. Each response includes a "server_timing" key with latency info.
+    """
+
+    def __init__(
+        self,
+        policy: Policy,
+        host: str = "0.0.0.0",
+        port: int = 10093,
+        metadata: dict | None = None,
+    ) -> None:
+        self._policy = policy
+        self._host = host
+        self._port = port
+        self._metadata = metadata or {}  # sent to every client as the first frame
+
+    def serve_forever(self) -> None:
+        asyncio.run(self.run())  # blocking entry point; runs its own event loop
+
+    async def run(self) -> None:
+        async with _server.serve(
+            self._handler,
+            self._host,
+            self._port,
+            compression=None,
+            max_size=None,  # no cap on incoming message size
+            process_request=_health_check,  # consulted before the websocket handshake
+        ) as server:
+            await server.serve_forever()
+
+    async def _handler(self, websocket: _server.ServerConnection) -> None:
+        """Serve one connection: send metadata once, then answer each obs with an action."""
+        logger.info(f"Connection from {websocket.remote_address} opened")
+        packer = msgpack_numpy.Packer()
+        await websocket.send(packer.pack(self._metadata))
+
+        prev_total_time: float | None = None
+        while True:
+            try:
+                start_time = time.monotonic()
+                obs: dict = msgpack_numpy.unpackb(await websocket.recv())
+                # Synchronous call: the event loop is blocked while the policy runs.
+                infer_time = time.monotonic()
+                action: dict = self._policy.infer(obs)
+                infer_time = time.monotonic() - infer_time
+                # The timing dict is written into the dict returned by the policy.
+                action["server_timing"] = {"infer_ms": infer_time * 1000}
+                if prev_total_time is not None:
+                    action["server_timing"]["prev_total_ms"] = prev_total_time * 1000
+
+                await websocket.send(packer.pack(action))
+                prev_total_time = time.monotonic() - start_time
+
+            except websockets.ConnectionClosed:
+                logger.info(f"Connection from {websocket.remote_address} closed")
+                break
+            except Exception:
+                await websocket.send(traceback.format_exc())
+                await websocket.close(
+                    code=websockets.frames.CloseCode.INTERNAL_ERROR,
+                    reason="Internal server error. Traceback included in previous frame.",
+                )
+                raise
+
+
+def _health_check(connection: _server.ServerConnection, request: Request) -> Response | None:
+    if request.path == "/healthz":  # liveness probe: reply 200 "OK" over plain HTTP
+        return connection.respond(http.HTTPStatus.OK, "OK\n")
+    return None  # any other path: continue with the websocket handshake
diff --git a/flagscale/train/datasets/lerobot_dataset.py b/flagscale/train/datasets/lerobot_dataset.py
index f3ed6611d1..0fa02a0641 100644
--- a/flagscale/train/datasets/lerobot_dataset.py
+++ b/flagscale/train/datasets/lerobot_dataset.py
@@ -509,7 +509,7 @@ def __repr__(self):
         feature_keys = list(self.features)
         return (
             f"{self.__class__.__name__}({{\n"
-            f"    Repository ID: '{self.repo_id}',\n"
+            f"    Root: '{self.root}',\n"
             f"    Total episodes: '{self.total_episodes}',\n"
             f"    Total frames: '{self.total_frames}',\n"
             f"    Features: '{feature_keys}',\n"
@@ -1063,6 +1063,7 @@ def __getitem__(self, idx) -> dict:
         self._ensure_hf_dataset_loaded()
         item = self.hf_dataset[idx]
         ep_idx = item["episode_index"].item()
+        print(f"idx: {idx}, ep_idx: {ep_idx}")
 
         query_indices = None
         if self.delta_indices is not None:
@@ -1075,6 +1076,7 @@ def __getitem__(self, idx) -> dict:
         if len(self.meta.video_keys) > 0:
             current_ts = item["timestamp"].item()
             query_timestamps = self._get_query_timestamps(current_ts, query_indices)
+            print(f"query_timestamps: {query_timestamps}")
             video_frames = self._query_videos(query_timestamps, ep_idx)
             item = {**video_frames, **item}
 
@@ -1086,6 +1088,15 @@ def __getitem__(self, idx) -> dict:
         # Add task as a string
         task_idx = item["task_index"].item()
         item["task"] = self.meta.tasks.iloc[task_idx].name
+
+        # DEBUG: print raw action from dataset
+        if "action" in item:
+            action_val = item["action"]
+            if hasattr(action_val, "numpy"):
+                action_val = action_val.numpy()
+            print(
+                f"[FS dataset] action[0,:5]: {action_val[0, :5].tolist() if len(action_val.shape) > 1 else action_val[:5].tolist()}"
+            )
         return item
 
     def __repr__(self):
diff --git a/flagscale/train/datasets/video_utils.py b/flagscale/train/datasets/video_utils.py
index 0cd45e8efb..53cd6c5293 100644
--- a/flagscale/train/datasets/video_utils.py
+++ b/flagscale/train/datasets/video_utils.py
@@ -27,7 +27,6 @@
 from typing import Any, ClassVar
 
 import av
-import fsspec
 import pyarrow as pa
 import torch
 import torchvision
@@ -66,6 +65,8 @@ def decode_video_frames(
         backend = get_safe_default_codec()
     if backend == "torchcodec":
         return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s)
+    elif backend == "torchvision_av":
+        return decode_video_frames_torchvision_av(video_path, timestamps, tolerance_s)
     elif backend in ["pyav", "video_reader"]:
         return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
     else:
@@ -170,6 +171,53 @@ def decode_video_frames_torchvision(
     return closest_frames
 
 
+def decode_video_frames_torchvision_av(
+    video_path: Path | str,
+    timestamps: list[float],
+    tolerance_s: float,
+) -> torch.Tensor:
+    """Match starVLA torchvision_av behavior: seek per timestamp and pick closest frame."""
+    video_path = str(video_path)
+    torchvision.set_video_backend("pyav")  # NOTE: switches torchvision's global video backend
+
+    loaded_frames = []
+    loaded_ts = []
+    reader = None
+    try:
+        reader = torchvision.io.VideoReader(video_path, "video")
+        for target_ts in timestamps:
+            reader.seek(target_ts, keyframes_only=True)  # keyframe seek, then scan forward
+            closest_frame = None
+            closest_diff = float("inf")
+            for frame in reader:
+                current_ts = frame["pts"]
+                current_diff = abs(current_ts - target_ts)
+                if current_diff < closest_diff:
+                    closest_diff = current_diff
+                    closest_frame = frame
+                else:
+                    break  # distance stopped shrinking: the previous frame was the closest
+            if closest_frame is None:
+                raise ValueError(f"No frame found for timestamp {target_ts}")
+            frame_data = closest_frame["data"]
+            loaded_frames.append(frame_data)
+            loaded_ts.append(closest_frame["pts"])
+    finally:
+        if reader is not None and hasattr(reader, "container"):
+            reader.container.close()
+    # Frames are stacked along a new leading dim, one entry per requested timestamp.
+    frames = torch.stack(
+        [f if isinstance(f, torch.Tensor) else torch.as_tensor(f) for f in loaded_frames]
+    )
+    if tolerance_s is not None:  # NOTE(review): the assert below is stripped under python -O
+        diff = (torch.tensor(loaded_ts) - torch.tensor(timestamps)).abs()
+        assert (diff < tolerance_s).all(), (
+            f"Loaded timestamps exceed tolerance: {diff.max()} > {tolerance_s}"
+        )
+    frames = frames.type(torch.float32) / 255  # assumes 8-bit frame data — TODO confirm
+    return frames
+
+
 class VideoDecoderCache:
     """Thread-safe cache for video decoders to avoid expensive re-initialization."""
 
@@ -188,9 +236,8 @@ def get_decoder(self, video_path: str):
 
         with self._lock:
             if video_path not in self._cache:
-                file_handle = fsspec.open(video_path).__enter__()
-                decoder = VideoDecoder(file_handle, seek_mode="approximate")
-                self._cache[video_path] = (decoder, file_handle)
+                decoder = VideoDecoder(video_path, seek_mode="approximate")
+                self._cache[video_path] = (decoder, None)
 
             return self._cache[video_path][0]
 
@@ -198,7 +245,8 @@ def clear(self):
         """Clear the cache and close file handles."""
         with self._lock:
             for _, file_handle in self._cache.values():
-                file_handle.close()
+                if file_handle is not None:
+                    file_handle.close()
             self._cache.clear()
 
     def size(self) -> int:
diff --git a/flagscale/train/processor/normalize_processor.py b/flagscale/train/processor/normalize_processor.py
index 6ca5145bb1..d5101bf759 100644
--- a/flagscale/train/processor/normalize_processor.py
+++ b/flagscale/train/processor/normalize_processor.py
@@ -101,6 +101,7 @@ class _NormalizationMixin:
     dtype: torch.dtype | None = None
     eps: float = 1e-8
     normalize_observation_keys: set[str] | None = None
+    normalize_action_dims: int | None = None
 
     _tensor_stats: dict[str, dict[str, Tensor]] = field(
         default_factory=dict, init=False, repr=False
@@ -249,6 +250,8 @@ def get_config(self) -> dict[str, Any]:
         }
         if self.normalize_observation_keys is not None:
             config["normalize_observation_keys"] = sorted(self.normalize_observation_keys)
+        if self.normalize_action_dims is not None:
+            config["normalize_action_dims"] = self.normalize_action_dims
         return config
 
     def _normalize_observation(
@@ -294,6 +297,11 @@ def _normalize_action(self, action: Tensor, inverse: bool) -> Tensor:
         processed_action = self._apply_transform(
             action, ACTION, FeatureType.ACTION, inverse=inverse
         )
+        if self.normalize_action_dims is not None:
+            d = self.normalize_action_dims
+            result = action.clone()
+            result[..., :d] = processed_action[..., :d]
+            return result
         return processed_action
 
     def _apply_transform(
diff --git a/flagscale/train/train_config.py b/flagscale/train/train_config.py
index ec35b4b141..5c386b64bd 100644
--- a/flagscale/train/train_config.py
+++ b/flagscale/train/train_config.py
@@ -4,24 +4,117 @@
 
 from typing import Any
 
-from omegaconf import OmegaConf
+from omegaconf import DictConfig, OmegaConf
 from pydantic import BaseModel, Field, field_validator
 
 
+class FreezeConfig(BaseModel):
+    """Pattern-based module freezing configuration (NeMo-style).
+
+    Freezing logic:
+    1. For each parameter, check if name matches any `freeze_patterns`
+    2. If matched, check if name also matches any `keep_patterns`
+    3. If matched by freeze but NOT by keep → freeze (requires_grad=False)
+
+    `keep_patterns` overrides `freeze_patterns` - this allows freezing a module
+    but keeping specific sub-components trainable.
+
+    Patterns are regex patterns matched against full parameter names.
+    """
+
+    model_config = {"extra": "allow"}  # unknown keys are accepted instead of rejected
+
+    freeze_patterns: list[str] | None = None  # regexes selecting parameters to freeze
+    keep_patterns: list[str] | None = None  # regexes exempting matches from freezing
+
+
 class OptimizerConfig(BaseModel):
-    """Optimizer configuration"""
+    """Optimizer configuration.
+
+    Attributes:
+        name: Optimizer class name. Currently supported: "AdamW".
+        lr: Learning rate (default for all param groups).
+        betas: Adam beta coefficients (beta1, beta2).
+        eps: Adam epsilon for numerical stability.
+        weight_decay: Weight decay (L2 penalty).
+        param_groups: Per-module optimizer overrides. Maps module paths to optimizer kwargs.
+            Example: {"encoder": {"lr": 1e-5}, "decoder": {"lr": 1e-3}}
+
+    Example config (YAML):
+        optimizer:
+          name: AdamW
+          lr: 1e-4
+          weight_decay: 0.01
+          param_groups:
+            vision_encoder:
+              lr: 1e-5
+            action_head:
+              lr: 2e-4
+    """
 
     name: str = "AdamW"
-    lr: float = 2.5e-5
-    betas: tuple[float, float] = (0.9, 0.95)
-    eps: float = 1e-8
-    weight_decay: float = 0.01
+    lr: float | None = None
+    betas: tuple[float, float] | None = None
+    eps: float | None = None
+    weight_decay: float | None = None
+    param_groups: dict[str, dict[str, Any]] | None = Field(
+        default=None,
+        description="Per-module optimizer settings. Maps module paths to optimizer kwargs.",
+    )
+
+    @field_validator("betas", mode="before")
+    @classmethod
+    def normalize_betas(cls, v):
+        """Coerce a list-valued ``betas`` to a tuple before validation.
+
+        Any other value, ``None`` included, is returned unchanged.
+        """
+        needs_tuple = isinstance(v, list)
+        if not needs_tuple:
+            # None, tuples and anything else pass through untouched.
+            return v
+        return tuple(v)
+
+    def get_optimizer_kwargs(self) -> dict[str, Any]:
+        """Collect the optimizer hyper-parameters that were explicitly set.
+
+        Returns:
+            A dict holding ``lr``, ``betas``, ``eps`` and ``weight_decay`` in that
+            order, with every entry whose value is ``None`` left out.
+        """
+        kwargs: dict[str, Any] = {}
+        # Fixed key order keeps the resulting dict layout stable.
+        for key in ("lr", "betas", "eps", "weight_decay"):
+            value = getattr(self, key)
+            if value is None:
+                # Unset: omit the key entirely.
+                continue
+            kwargs[key] = value
+        return kwargs
 
 
 class SchedulerConfig(BaseModel):
-    """Learning rate scheduler configuration"""
+    """Learning rate scheduler configuration.
 
+    Uses transformers scheduler types when `name` is set. See transformers.SchedulerType for options:
+    linear, cosine, cosine_with_restarts, polynomial, constant,
+    constant_with_warmup, inverse_sqrt, cosine_with_min_lr, etc.
+
+    Example:
+        scheduler:
+          name: cosine
+          warmup_steps: 1000
+          scheduler_kwargs:
+            min_lr: 1e-6
+
+    For backward compatibility with pi0/pi0.5, the legacy fields (decay_steps, decay_lr) are kept.
+    """
+
+    name: str | None = None
     warmup_steps: int = 1000
+    scheduler_kwargs: dict[str, Any] | None = None
+
+    # Legacy fields for pi0/pi0.5 backward compatibility
     decay_steps: int = 30000
     decay_lr: float = 2.5e-6
 
@@ -37,6 +130,8 @@ class CheckpointConfig(BaseModel):
 class SystemConfig(BaseModel):
     """Training loop configuration"""
 
+    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
+
     batch_size: int = 1
     train_steps: int = 100000
     log_freq: int = 10
@@ -48,16 +143,32 @@ class SystemConfig(BaseModel):
     optimizer: OptimizerConfig
     scheduler: SchedulerConfig
     checkpoint: CheckpointConfig
+    raw: DictConfig | None = Field(default=None, exclude=True)
+
+    def __getattr__(self, name):
+        # Raw DictConfig first; otherwise defer to pydantic's lookup (extra fields, private attrs).
+        raw = self.__dict__.get("raw")
+        found = raw is not None and hasattr(raw, name)
+        return getattr(raw, name) if found else super().__getattr__(name)
 
 
 class DataConfig(BaseModel):
     """Dataset configuration"""
 
+    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
+
     data_path: str = Field(..., description="Path to training dataset")
     tolerance_s: float = 0.0001
     use_imagenet_stats: bool = True
     rename_map: dict[str, str] | None = None
     use_quantiles: bool = False
+    raw: DictConfig | None = Field(default=None, exclude=True)
+
+    def __getattr__(self, name):
+        # Raw DictConfig first; otherwise defer to pydantic's lookup (extra fields, private attrs).
+        raw = self.__dict__.get("raw")
+        found = raw is not None and hasattr(raw, name)
+        return getattr(raw, name) if found else super().__getattr__(name)
 
 
 class ModelConfig(BaseModel):
@@ -72,22 +183,33 @@ class ModelConfig(BaseModel):
     All other fields are passed through to the model's config class.
     """
 
-    model_config = {"extra": "allow"}  # Allow extra fields for model-specific config
+    model_config = {
+        "extra": "allow",
+        "arbitrary_types_allowed": True,
+    }  # Allow extra fields for model-specific config
 
     # Required fields to identify which model and checkpoint to use
     model_name: str = Field(..., description="Model name: 'pi0' or 'pi0.5'")
     checkpoint_dir: str = Field(..., description="Path to pretrained model checkpoint")
-
-    @field_validator("model_name")
-    @classmethod
-    def validate_model_name(cls, v):
-        if v not in ["pi0", "pi0.5"]:
-            raise ValueError(f"Invalid model_name: {v}. Must be 'pi0' or 'pi0.5'")
-        return v
+    freeze: FreezeConfig | None = None
+    raw: DictConfig | None = Field(default=None, exclude=True)
+
+    def __getattr__(self, name):
+        # Raw DictConfig first; otherwise defer to pydantic's lookup (extra fields, private attrs).
+        raw = self.__dict__.get("raw")
+        found = raw is not None and hasattr(raw, name)
+        return getattr(raw, name) if found else super().__getattr__(name)
+
+    # @field_validator("model_name")
+    # @classmethod
+    # def validate_model_name(cls, v):
+    #     if v not in ["pi0", "pi0.5"]:
+    #         raise ValueError(f"Invalid model_name: {v}. Must be 'pi0' or 'pi0.5'")
+    #     return v
 
     def get_model_config_dict(self) -> dict[str, Any]:
         """Get all model-specific config fields (excluding train-level fields)."""
-        return self.model_dump(exclude={"model_name", "checkpoint_dir"})
+        return self.model_dump(exclude={"model_name", "checkpoint_dir", "freeze"})
 
 
 class TrainConfig(BaseModel):
@@ -100,10 +222,13 @@ class TrainConfig(BaseModel):
     @classmethod
     def from_hydra_config(cls, hydra_config) -> "TrainConfig":
         """Convert Hydra DictConfig to Pydantic TrainConfig"""
-        train_dict = OmegaConf.to_container(hydra_config.train, resolve=True)
+        train = hydra_config.train
+        train_dict = OmegaConf.to_container(train, resolve=True)
+        train_dict["system"] = SystemConfig(**train_dict["system"], raw=train.system)
+        train_dict["data"] = DataConfig(**train_dict["data"], raw=train.data)
+        train_dict["model"] = ModelConfig(**train_dict["model"], raw=train.model)
         return cls(**train_dict)
 
     class Config:
         # Allow arbitrary types for complex objects
         arbitrary_types_allowed = True
-
diff --git a/flagscale/train/train_qwen_gr00t.py b/flagscale/train/train_qwen_gr00t.py
new file mode 100644
index 0000000000..4665759ee1
--- /dev/null
+++ b/flagscale/train/train_qwen_gr00t.py
@@ -0,0 +1,1227 @@
+# Mainly adopted from
+# https://github.com/huggingface/lerobot/blob/2b304eeb841ae6c371e3dd341bbbb9dd254b07cb/src/lerobot/scripts/lerobot_train.py
+
+import argparse
+import os
+import random
+import time
+from collections.abc import Iterator
+from contextlib import nullcontext
+from typing import Any, TypedDict
+
+try:
+    from typing import Unpack  # Python 3.11+
+except ImportError:
+    from typing_extensions import Unpack  # Python < 3.11
+
+from omegaconf import OmegaConf, DictConfig
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch.optim import Optimizer
+# from torch.nn.parallel import DistributedDataParallel as DDP  # Commented out: using accelerate instead
+
+# Accelerate for distributed training (matching starVLA)
+from accelerate import Accelerator
+from accelerate.utils import DistributedDataParallelKwargs, set_seed as accelerate_set_seed
+
+from flagscale.runner.utils import logger
+from flagscale.train.train_config import TrainConfig, DataConfig
+from flagscale.train.datasets.transforms import ImageTransforms
+from flagscale.train.datasets.lerobot_dataset import (
+    LeRobotDataset,
+    LeRobotDatasetMetadata,
+)
+from flagscale.train.datasets.utils import dataset_to_policy_features
+from flagscale.train.processor import PolicyAction, PolicyProcessorPipeline
+from flagscale.train.processor.converters import (
+    batch_to_transition,
+    policy_action_to_transition,
+    transition_to_batch,
+    transition_to_policy_action,
+)
+from flagscale.models.utils.constants import (
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from flagscale.models.configs.types import PolicyFeature
+from flagscale.models.utils.constants import ACTION, OBS_PREFIX, REWARD
+from flagscale.models.configs.types import FeatureType
+from flagscale.train.utils.logging_utils import (
+    AverageMeter,
+    MetricsTracker,
+    format_big_number,
+)
+from flagscale.train.utils.train_utils import (
+    save_checkpoint,
+    get_step_checkpoint_dir,
+    update_last_checkpoint,
+)
+from flagscale.train.utils.optim_setup import setup_optimizer_and_scheduler
+from flagscale.models.vla.qwen_gr00t import QwenGr00t
+from flagscale.models.qwen_pi.qwen_pi import Qwen_PI
+
+IMAGENET_STATS = {
+    "mean": [[[0.485]], [[0.456]], [[0.406]]],  # (c,1,1)
+    "std": [[[0.229]], [[0.224]], [[0.225]]],  # (c,1,1)
+}
+
+from PIL import Image
+from torch.utils.data import Dataset as TorchDataset
+
+def collate_fn_starvla(batch):
+    """Identity collate: return the list of per-sample dicts unchanged (starVLA style)."""
+    return batch
+
+
+class StarVLAFormatDataset(TorchDataset):
+    """Wrapper dataset that converts FlagScale tensor images to match starVLA format.
+
+    Conversion to match starVLA exactly:
+    1. FlagScale tensor: float32 CHW, [0,1] range
+    2. Convert to uint8 HWC: multiply by 255, permute, cast to uint8
+    3. PIL.fromarray + resize (same as starVLA)
+
+    Returned sample (starVLA format plus two ids kept for debugging):
+        dict(
+            action=np.ndarray [T, action_dim],  # float16
+            image=[PIL.Image, ...],             # list of PIL images (224x224)
+            lang=str,                           # language instruction
+            trajectory_id=..., frame_index=...,  # item["episode_index"] / item["index"]
+        )
+    """
+
+    def __init__(
+        self,
+        dataset: "LeRobotDataset",
+        image_keys: list[str] | None = None,
+        image_size: tuple[int, int] = (224, 224),
+    ):
+        """Wrap ``dataset``; cache its action min/max stats for normalization in __getitem__."""
+        self.dataset = dataset
+        self.image_keys = image_keys or [
+            "observation.images.image",
+            "observation.images.wrist_image",
+        ]
+        self.image_size = image_size
+        # Get action stats for min_max normalization (matching starVLA's StateActionTransform)
+        action_stats = dataset.meta.stats.get("action", {})
+        self.action_min = action_stats.get("min", None)
+        self.action_max = action_stats.get("max", None)
+        # Convert to numpy if needed
+        if self.action_min is not None and hasattr(self.action_min, 'numpy'):
+            self.action_min = self.action_min.numpy()
+        if self.action_max is not None and hasattr(self.action_max, 'numpy'):
+            self.action_max = self.action_max.numpy()
+
+        # Debug: print stats
+        print(f"[StarVLAFormatDataset] action_min: {self.action_min}")
+        print(f"[StarVLAFormatDataset] action_max: {self.action_max}")
+        self._debug_count = 0  # Counter for debug prints
+
+    def __len__(self):
+        return len(self.dataset)
+
+    @property
+    def num_frames(self):
+        return self.dataset.num_frames
+
+    @property
+    def num_episodes(self):
+        return self.dataset.num_episodes
+
+    def _tensor_to_pil_starvla(self, tensor: torch.Tensor) -> Image.Image:
+        """
+        Convert tensor to PIL exactly like starVLA:
+        1. tensor is float32 CHW [0,1] from torchcodec
+        2. Convert to uint8 HWC [0,255]
+        3. PIL.fromarray + resize
+        """
+        # 4-D input: keep only the first element along the leading dim
+        if tensor.ndim == 4:
+            tensor = tensor[0]
+
+        # Heuristic: a leading dim of size 1, 3 or 4 is treated as channels (CHW -> HWC)
+        if tensor.shape[0] in (1, 3, 4):
+            tensor = tensor.permute(1, 2, 0)
+
+        # Scale by 255 and cast to uint8 (the cast truncates; no rounding or clipping here)
+        img_np = (tensor.detach().cpu().numpy() * 255).astype(np.uint8)
+
+        # PIL.fromarray + resize (exactly like starVLA)
+        pil_img = Image.fromarray(img_np).resize(self.image_size)
+        return pil_img
+
+    def __getitem__(self, idx: int) -> dict:
+        """Fetch item ``idx`` from the wrapped dataset and convert it to a starVLA-style dict."""
+        item = self.dataset[idx]
+        # Convert images to PIL format; keys missing from the item are skipped silently
+        images = []
+        for key in self.image_keys:
+            if key in item:
+                pil_img = self._tensor_to_pil_starvla(item[key])
+                images.append(pil_img)
+
+        # Get action as a NumPy array (the float16 cast happens after normalization)
+        action = item["action"]
+        if isinstance(action, torch.Tensor):
+            action = action.detach().cpu().numpy()
+
+        # Debug: raw action for the first 16 calls. NOTE(review): action[0,:5] assumes 2-D.
+        if self._debug_count < 16:
+            traj_id = item.get("episode_index", -1)
+            if isinstance(traj_id, torch.Tensor):
+                traj_id = traj_id.item()
+            frame_idx = item.get("index", idx)
+            if isinstance(frame_idx, torch.Tensor):
+                frame_idx = frame_idx.item()
+            print(f"[StarVLAFormatDataset] idx={idx} traj={traj_id} frame={frame_idx} RAW action[0,:5]: {action[0,:5].tolist()}")
+            print(f"[StarVLAFormatDataset] idx={idx} RAW action sum: {action.sum():.4f}")
+
+        # Apply min_max normalization (matching starVLA's Libero4in1DataConfig exactly)
+        # starVLA only normalizes action.x, y, z, roll, pitch, yaw (indices 0-5)
+        # action.gripper (index 6) is NOT normalized
+        # Formula: 2 * (x - min) / (max - min) - 1
+        if self.action_min is not None and self.action_max is not None:
+            # Only normalize first 6 dimensions (x, y, z, roll, pitch, yaw)
+            # Keep gripper (dim 6) as raw value
+            normalize_dims = 6  # hard-coded; assumes stats and actions have >= 6 dims — TODO confirm
+            action_range = self.action_max[:normalize_dims] - self.action_min[:normalize_dims]
+            mask = action_range > 1e-8
+
+            normalized = action.copy()
+            # Normalize dimensions 0-5 where range > 1e-8; degenerate dims are set to 0.0
+            for i in range(normalize_dims):
+                if mask[i]:
+                    normalized[..., i] = (action[..., i] - self.action_min[i]) / action_range[i]
+                    normalized[..., i] = 2.0 * normalized[..., i] - 1.0
+                else:
+                    normalized[..., i] = 0.0
+            # Keep dimension 6 (gripper) as-is (no normalization)
+            action = normalized
+
+        # Debug: print normalized action values (only first few samples)
+        if self._debug_count < 16:
+            print(f"[StarVLAFormatDataset] idx={idx} NORM action[0,:5]: {action[0,:5].tolist()}")
+            print(f"[StarVLAFormatDataset] idx={idx} NORM action sum: {action.sum():.4f}")
+            self._debug_count += 1
+
+        action = action.astype(np.float16)
+
+        # Get language instruction; a tensor-valued task is turned into a scalar or a string
+        lang = item.get("task", "")
+        if isinstance(lang, torch.Tensor):
+            lang = lang.item() if lang.numel() == 1 else str(lang.tolist())
+
+        # Get trajectory_id and frame_index for debugging (matching starVLA format)
+        trajectory_id = item.get("episode_index", -1)
+        if isinstance(trajectory_id, torch.Tensor):
+            trajectory_id = trajectory_id.item()
+        frame_index = item.get("index", idx)
+        if isinstance(frame_index, torch.Tensor):
+            frame_index = frame_index.item()
+
+        return dict(
+            action=action,
+            image=images,
+            lang=lang,
+            trajectory_id=trajectory_id,
+            frame_index=frame_index,
+        )
+
+def register_debug_hooks(model_obj):
+    """Attach rank-tagged forward/backward debug hooks to every leaf module.
+
+    The forward hook prints the float32 sum of each tensor input and output;
+    the backward hook prints the float32 sum of each gradient. Non-tensor
+    values (including ``None``) are skipped. Every print calls ``.item()``,
+    so this noticeably slows training down and is meant for debugging only.
+
+    Args:
+        model_obj: Either a module, or a list whose first element is the
+            module to instrument.
+
+    Returns:
+        The list of hook handles that were registered, in registration order
+        (forward handle then backward handle for each hooked module). Call
+        ``handle.remove()`` on each one to detach exactly these hooks.
+    """
+
+    def get_rank():
+        # Fall back to rank 0 when torch.distributed is unavailable or not initialized.
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            return torch.distributed.get_rank()
+        return 0
+
+    def calc_and_print(tensor, name, tag):
+        """Print the sum of ``tensor`` as ``[Rank r][tag] name sum: value``.
+
+        Args:
+            tensor: Value to report; anything that is not a tensor is ignored.
+            name: Module name plus argument position.
+            tag: ``"FWD"`` or ``"BWD"``.
+        """
+        if not isinstance(tensor, torch.Tensor):
+            return
+        rank = get_rank()
+        # Sum in float32 so low-precision tensors do not overflow.
+        val = torch.sum(tensor.detach().to(torch.float32)).item()
+        print(f"[Rank {rank}][{tag}] {name} sum: {val}", flush=True)
+
+    def report(values, label, tag):
+        # Hook arguments arrive either as a list/tuple or as a single value.
+        if isinstance(values, (list, tuple)):
+            for i, value in enumerate(values):
+                calc_and_print(value, f"{label}[{i}]", tag)
+        else:
+            calc_and_print(values, label, tag)
+
+    def forward_wrapper(name):
+        def forward_hook(module, inputs, output):
+            report(inputs, f"{name}.input", "FWD")
+            report(output, f"{name}.output", "FWD")
+        return forward_hook
+
+    def backward_wrapper(name):
+        def backward_hook(module, grad_input, grad_output):
+            # grad_output is the gradient flowing back into this module;
+            # grad_input is what this module passes on to the previous layer.
+            report(grad_output, f"{name}.grad_output", "BWD")
+            report(grad_input, f"{name}.grad_input", "BWD")
+        return backward_hook
+
+    # Accept both the list form and a bare module.
+    actual_module = model_obj[0] if isinstance(model_obj, list) else model_obj
+    print(f"Rank {get_rank()}: 开始挂载 Debug Hooks (仅叶子层)...", flush=True)
+    handles = []
+    for name, module in actual_module.named_modules():
+        # Hook leaf modules only (modules without children). The original author
+        # notes that hooking container modules caused view-related problems, and
+        # the leaves already cover all computation.
+        if any(True for _ in module.children()):
+            continue
+        # Optional blacklist: Dropout layers are not interesting to trace.
+        if isinstance(module, torch.nn.Dropout):
+            continue
+        handles.append(module.register_forward_hook(forward_wrapper(name)))
+        handles.append(module.register_full_backward_hook(backward_wrapper(name)))
+    return handles
+
+
+def remove_debug_hooks_force(model_obj):
+    """Clear every forward and backward hook on the model without needing handles.
+
+    The private ``_forward_hooks`` and ``_backward_hooks`` tables of each
+    submodule are emptied in place, so hooks registered by any other code are
+    removed as well, not only the debug hooks.
+
+    Args:
+        model_obj: Either a module, or a list whose first element is the module.
+    """
+    root = model_obj[0] if isinstance(model_obj, list) else model_obj
+    print("Force removing all hooks...", flush=True)
+    hook_tables = ("_forward_hooks", "_backward_hooks")
+    for submodule in root.modules():
+        for table_name in hook_tables:
+            # Only touch tables that actually exist on this module.
+            if hasattr(submodule, table_name):
+                getattr(submodule, table_name).clear()
+    print("Hooks force removed.", flush=True)
+
+
+
+
+# Commented out: using accelerate's set_seed instead
+# def set_seed(seed: int):
+#     np.random.seed(seed)
+#     random.seed(seed)
+#     torch.manual_seed(seed)
+#     if torch.cuda.is_available():
+#         torch.cuda.manual_seed_all(seed)
+#
+#     torch.backends.cudnn.enabled = True
+#     torch.backends.cudnn.benchmark = True
+#     torch.backends.cudnn.deterministic = True
+#     torch.backends.cuda.matmul.allow_tf32 = True
+
+def set_seed(seed: int):
+    """Seed through accelerate's ``set_seed``, then pin the cuDNN / TF32 backend flags.
+
+    After seeding, cuDNN stays enabled while its benchmark mode and
+    deterministic mode are both switched off, and TF32 matmul is disabled.
+
+    Args:
+        seed: Seed forwarded to accelerate's ``set_seed``.
+    """
+    accelerate_set_seed(seed)
+
+    cudnn = torch.backends.cudnn
+    cudnn.enabled = True
+    cudnn.benchmark = False
+    cudnn.deterministic = False
+
+    torch.backends.cuda.matmul.allow_tf32 = False
+
+
+# Commented out: using accelerate instead of manual DDP
+# def init_ddp():
+#     local_rank = int(os.environ["LOCAL_RANK"])
+#     torch.cuda.set_device(local_rank)
+#     torch.distributed.init_process_group(backend="nccl", init_method="env://")
+#     return local_rank
+
+# Initialize Accelerator at module level (matching starVLA).
+# The single `accelerator` object is created when this module is imported and is
+# used as a global by `update_policy` and `main` in this file.
+# `find_unused_parameters=True` is handed to the DDP kwargs handler.
+# NOTE(review): presumably needed because some parameters receive no gradient
+# in a forward pass -- confirm, since the flag adds overhead when it is not.
+ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
+
+
+# TODO: (yupu) Re-enable wandb
+# def init_wandb(config, *, resuming: bool, log_code: bool = False, enabled: bool = True):
+#     if not enabled:
+#         wandb.init(mode="disabled")
+#         return
+
+#     ckpt_dir = pathlib.Path(config.checkpoint_dir)
+#     if not ckpt_dir.exists():
+#         raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")
+#     if resuming:
+#         run_id = (ckpt_dir / "wandb_id.txt").read_text().strip()
+#         wandb.init(id=run_id, resume="must", project=config.project_name)
+#     else:
+#         wandb.init(
+#             name=config.exp_name, config=vars(config), project=config.project_name
+#         )
+#         (ckpt_dir / "wandb_id.txt").write_text(wandb.run.id)
+
+#     if log_code:
+#         wandb.run.log_code(epath.Path(__file__).parent.parent)
+
+
+def make_dataset(cfg: DataConfig):
+    """Build the ``LeRobotDataset`` rooted at ``cfg.data_path``.
+
+    A starVLA-style resize function is passed as ``image_transforms``: frames
+    are scaled to uint8, resized to 224x224 with PIL bilinear resampling and
+    scaled back to float by dividing by 255. When ``cfg.use_imagenet_stats``
+    is truthy, the stats of every camera key are overwritten with
+    ``IMAGENET_STATS``.
+
+    Args:
+        cfg: Data config. ``data_path``, ``tolerance_s`` and
+            ``use_imagenet_stats`` are read here; the config is also passed to
+            ``resolve_delta_timestamps``.
+
+    Returns:
+        The constructed dataset.
+    """
+    # TODO: (yupu) Support image transforms
+    enable_image_transform = False
+    # TODO: (yupu) Remove hard-coded video backend
+    # After limited testing, `torchcodec` seems more robust than `pyav`, which
+    # sometimes crashes. Other backends tried: "torchvision_av", "pyav".
+    video_backend = "torchcodec"
+
+    # Match starVLA: resize uint8 via PIL, then normalize to [0,1]
+    def _resize_like_starvla(frames: torch.Tensor) -> torch.Tensor:
+        """Resize a single frame (3-D) or a batch of frames (4-D) to 224x224."""
+        # Anything that is not a 3-D / 4-D tensor is handed back untouched.
+        if not isinstance(frames, torch.Tensor):
+            return frames
+        is_single = frames.dim() == 3
+        if is_single:
+            frames = frames.unsqueeze(0)
+        if frames.dim() != 4:
+            return frames
+        from PIL import Image
+        import numpy as np
+
+        def _resize_one(frame):
+            # Guess the layout from the size of the last axis, then the first.
+            if frame.shape[-1] in (1, 3, 4):
+                hwc, restore_chw = frame, False
+            elif frame.shape[0] in (1, 3, 4):
+                hwc, restore_chw = frame.permute(1, 2, 0), True
+            else:
+                # Ambiguous layout: treated as already channel-last.
+                hwc, restore_chw = frame, False
+            as_uint8 = (hwc * 255).round().clamp(0, 255).to(torch.uint8)
+            pil = Image.fromarray(as_uint8.cpu().numpy())
+            pil = pil.resize((224, 224), resample=Image.BILINEAR)
+            resized = torch.from_numpy(np.array(pil)).to(frames.device).float() / 255.0
+            # Hand the frame back in the layout it arrived in.
+            return resized.permute(2, 0, 1) if restore_chw else resized
+
+        output = torch.stack([_resize_one(frame) for frame in frames], dim=0)
+        return output[0] if is_single else output
+
+    image_transforms = _resize_like_starvla
+    # Leave the revision to None
+    ds_meta = LeRobotDatasetMetadata(root=cfg.data_path, revision=None)
+    delta_timestamps = resolve_delta_timestamps(cfg, ds_meta)
+
+    dataset = LeRobotDataset(
+        root=cfg.data_path,
+        episodes=None,
+        delta_timestamps=delta_timestamps,
+        image_transforms=image_transforms,
+        revision=None,
+        video_backend=video_backend,
+        tolerance_s=cfg.tolerance_s,
+    )
+
+    if not cfg.use_imagenet_stats:
+        return dataset
+
+    # Replace the per-camera statistics with the ImageNet ones.
+    for camera_key in dataset.meta.camera_keys:
+        for stats_type, stats in IMAGENET_STATS.items():
+            dataset.meta.stats[camera_key][stats_type] = torch.tensor(stats, dtype=torch.float32)
+    return dataset
+
+
+def resolve_delta_timestamps(cfg: DataConfig, ds_meta: LeRobotDatasetMetadata) -> dict[str, list] | None:
+    """Convert the config's delta indices into per-feature delta timestamps.
+
+    Every index is divided by ``ds_meta.fps``. A feature gets an entry when it
+    is the reward key, the action key, or starts with the observation prefix,
+    and the matching ``*_delta_indices`` field on ``cfg`` is not None.
+
+    Args:
+        cfg: Config providing ``reward_delta_indices``, ``action_delta_indices``
+            and ``observation_delta_indices``.
+        ds_meta (LeRobotDatasetMetadata): Dataset metadata providing ``features``
+            and ``fps``.
+
+    Returns:
+        dict[str, list] | None: A dictionary of delta_timestamps, e.g.:
+            {
+                "observation.state": [-0.04, -0.02, 0],
+                "observation.action": [-0.02, 0, 0.02],
+            }
+            or ``None`` if the resulting dict is empty.
+    """
+
+    def _to_timestamps(indices):
+        return [i / ds_meta.fps for i in indices]
+
+    resolved = {}
+    for key in ds_meta.features:
+        # The three checks are independent; if more than one matches a key,
+        # the later assignment overwrites the earlier one.
+        if key == REWARD and cfg.reward_delta_indices is not None:
+            resolved[key] = _to_timestamps(cfg.reward_delta_indices)
+        if key == ACTION and cfg.action_delta_indices is not None:
+            resolved[key] = _to_timestamps(cfg.action_delta_indices)
+        if key.startswith(OBS_PREFIX) and cfg.observation_delta_indices is not None:
+            resolved[key] = _to_timestamps(cfg.observation_delta_indices)
+
+    return resolved or None
+
+
+# datasets/utils.py
+def cycle(iterable: Any) -> Iterator[Any]:
+    """Create a dataloader-safe cyclical iterator.
+
+    This is an equivalent of `itertools.cycle` but is safe for use with
+    PyTorch DataLoaders with multiple workers: the iterable is re-iterated on
+    every pass instead of being cached.
+    See https://github.com/pytorch/pytorch/issues/23900 for details.
+
+    If a complete pass over ``iterable`` produces no items (an empty iterable,
+    or a one-shot iterator that is already exhausted), the generator stops
+    instead of spinning forever without yielding.
+
+    Args:
+        iterable: The iterable to cycle over.
+
+    Yields:
+        Items from the iterable, restarting from the beginning when exhausted.
+    """
+    while True:
+        produced_any = False
+        for item in iterable:
+            produced_any = True
+            yield item
+        if not produced_any:
+            # Nothing to cycle over: end the generator rather than busy-loop.
+            return
+
+
+def raise_feature_mismatch_error(
+    provided_features: set[str],
+    expected_features: set[str],
+) -> None:
+    """
+    Always raise a standardized ValueError describing a feature mismatch.
+
+    The message lists the expected features that were not provided and the
+    provided features that were not expected (each sorted, or ``None`` when
+    there are none), followed by a hint about the ``--rename_map`` argument.
+
+    Args:
+        provided_features: Feature names found in the dataset/environment.
+        expected_features: Feature names the policy config expects.
+
+    Raises:
+        ValueError: Unconditionally.
+    """
+    missing = expected_features - provided_features
+    extra = provided_features - expected_features
+    missing_text = sorted(missing) if missing else "None"
+    extra_text = sorted(extra) if extra else "None"
+    # TODO (jadechoghari): provide a dynamic rename map suggestion to the user.
+    message_lines = [
+        "Feature mismatch between dataset/environment and policy config.",
+        f"- Missing features: {missing_text}",
+        f"- Extra features: {extra_text}",
+        "",
+        "Please ensure your dataset and policy use consistent feature names.",
+        "If your dataset uses different observation keys (e.g., cameras named differently), "
+        "use the `--rename_map` argument, for example:",
+        '  --rename_map=\'{"observation.images.left": "observation.images.camera1", '
+        '"observation.images.top": "observation.images.camera2"}\'',
+    ]
+    raise ValueError("\n".join(message_lines))
+
+
+def format_train_tracker_step(train_tracker: MetricsTracker) -> str:
+    """Render the tracker's current counters and meter values as one log line.
+
+    The line contains ``step``, ``smpl``, ``ep`` and ``epch`` followed by one
+    ``name:value`` entry per meter in ``train_tracker.metrics``, separated by
+    single spaces. Meter values use the meter's own format spec and show the
+    latest value (``meter.val``).
+
+    Args:
+        train_tracker: Tracker providing ``steps``, ``samples``, ``episodes``,
+            ``epochs`` and ``metrics``.
+
+    Returns:
+        The space-separated summary string.
+    """
+
+    def _format_meter_val(meter: AverageMeter) -> str:
+        # The stored spec may carry a leading ":", which format() does not accept.
+        spec = meter.fmt.removeprefix(":")
+        return f"{meter.name}:{format(meter.val, spec)}"
+
+    parts = [
+        f"step:{format_big_number(train_tracker.steps)}",
+        f"smpl:{format_big_number(train_tracker.samples)}",
+        f"ep:{format_big_number(train_tracker.episodes)}",
+        f"epch:{train_tracker.epochs:.2f}",
+    ]
+    parts.extend(_format_meter_val(meter) for meter in train_tracker.metrics.values())
+    return " ".join(parts)
+
+
+# def validate_visual_features_consistency(
+#     cfg: PI0Config,
+#     features: dict[str, PolicyFeature],
+# ) -> None:
+#     """
+#     Validates visual feature consistency between a policy config and provided dataset/environment features.
+
+#     Args:
+#         cfg (PreTrainedConfig): The model or policy configuration containing input_features and type.
+#         features (Dict[str, PolicyFeature]): A mapping of feature names to PolicyFeature objects.
+#     """
+#     expected_visuals = {k for k, v in cfg.input_features.items() if v.type == FeatureType.VISUAL}
+#     provided_visuals = {k for k, v in features.items() if v.type == FeatureType.VISUAL}
+#     if not provided_visuals.issubset(expected_visuals):
+#         raise_feature_mismatch_error(provided_visuals, expected_visuals)
+
+
+def make_policy(
+    config: TrainConfig,
+    ds_meta: LeRobotDatasetMetadata | None = None,
+):
+    """
+    Instantiate the ``QwenGr00t`` policy and split the dataset features.
+
+    The dataset features are converted to policy features and partitioned
+    into output features (type ``ACTION``) and input features (everything
+    else). The visual input features are stored on
+    ``config.data.vla_data.image_features`` before the policy is built, so
+    ``config`` is modified in place.
+
+    Args:
+        config: The training configuration handed to ``QwenGr00t``.
+        ds_meta: Dataset metadata whose ``features`` are used to derive the
+            policy features. Required despite the ``None`` default.
+
+    Returns:
+        A tuple ``(policy, input_features, output_features)``; the policy has
+        been moved to ``"cuda"``.
+
+    Raises:
+        ValueError: If ``ds_meta`` is None.
+    """
+    if ds_meta is None:
+        # Previously this failed later with an opaque AttributeError on `None.features`.
+        raise ValueError("make_policy requires `ds_meta` to derive the policy features.")
+
+    features = dataset_to_policy_features(ds_meta.features)
+
+    # FIXME
+    # Feature types are compared with `==` rather than `is` because the enum may
+    # come from different classes: flagscale.FeatureType vs lerobot.FeatureType.
+    output_features = {
+        key: ft for key, ft in features.items() if ft.type == FeatureType.ACTION
+    }
+    input_features = {
+        key: ft for key, ft in features.items() if key not in output_features
+    }
+
+    # TODO: (yupu) This is a hack, we should find a better way to handle this. LeRobot does this in the policy config.
+    # The order of the images is defined in the dataset config.json
+    # Same `==` comparison as above, so visual features are not silently dropped
+    # when the two enum classes differ.
+    image_features = {
+        key: ft for key, ft in input_features.items() if ft.type == FeatureType.VISUAL
+    }
+    config.data.vla_data.image_features = image_features
+
+    policy = QwenGr00t(config=config)
+    print(policy)
+    print(f"config: {config}")
+
+    # FIXME: the target device is hard-coded.
+    policy.to("cuda")
+
+    return policy, input_features, output_features
+
+
+class ProcessorConfigKwargs(TypedDict, total=False):
+    """
+    A TypedDict defining the keyword arguments for processor configuration.
+
+    This provides type hints for the optional arguments passed to `make_pre_post_processors`,
+    improving code clarity and enabling static analysis. Because the class is declared with
+    `total=False`, every key may be omitted.
+
+    Attributes:
+        preprocessor_config_filename: The filename for the preprocessor configuration.
+        postprocessor_config_filename: The filename for the postprocessor configuration.
+        preprocessor_overrides: A dictionary of overrides for the preprocessor configuration.
+        postprocessor_overrides: A dictionary of overrides for the postprocessor configuration.
+        dataset_stats: Dataset statistics for normalization.
+    """
+
+    # Config filenames looked up under the pretrained path.
+    preprocessor_config_filename: str | None
+    postprocessor_config_filename: str | None
+    # Per-step overrides forwarded to the pipeline loader.
+    preprocessor_overrides: dict[str, Any] | None
+    postprocessor_overrides: dict[str, Any] | None
+    # NOTE(review): `make_pre_post_processors` in this file never reads this key -- confirm it is still needed.
+    dataset_stats: dict[str, dict[str, torch.Tensor]] | None
+
+
+def _normalize_step_entry(step_entry: Any) -> dict[str, Any]:
+    """Convert one step entry (any supported form) to a ``{"registry_name", "config"}`` dict."""
+    if isinstance(step_entry, str):
+        # Concise format: "step_name"
+        return {"registry_name": step_entry, "config": {}}
+    if not isinstance(step_entry, (dict, DictConfig)):
+        raise ValueError(
+            f"Step config must be str or dict, got {type(step_entry)}: {step_entry}"
+        )
+    if "registry_name" in step_entry:
+        # JSON format: {"registry_name": "...", "config": {...}}
+        if isinstance(step_entry, DictConfig):
+            step_entry = OmegaConf.to_container(step_entry, resolve=True)
+        return step_entry
+    if len(step_entry) == 1:
+        # Concise format: {"step_name": {...}}
+        step_name = next(iter(step_entry.keys()))
+        step_config = step_entry[step_name]
+        if isinstance(step_config, DictConfig):
+            step_config = OmegaConf.to_container(step_config, resolve=True)
+        return {"registry_name": step_name, "config": step_config}
+    raise ValueError(
+        f"Step config dict must have either 'registry_name' or exactly one key, "
+        f"got {list(step_entry.keys())}"
+    )
+
+
+def make_preprocessor_from_config(
+    config: dict[str, Any] | list[str | dict[str, Any]],
+    overrides: dict[str, Any] | None = None,
+) -> PolicyProcessorPipeline[dict[str, Any], dict[str, Any]]:
+    """
+    Create a preprocessor pipeline from step configurations with optional overrides.
+
+    This function creates a PolicyProcessorPipeline directly from step configurations,
+    without requiring a pretrained path. Each step class is looked up in
+    ``ProcessorStepRegistry`` by its registry name and instantiated with the
+    step's config merged with the matching overrides.
+
+    Args:
+        config: Can be either:
+            - A dict (or OmegaConf DictConfig) with "name" and "steps" fields (JSON format):
+              {"name": "policy_preprocessor", "steps": [...]}
+            - A list (or OmegaConf ListConfig) of step configurations (concise format):
+              ["step_name", {"step_name": {...}}]
+        overrides: Optional dictionary to override step configurations. Keys should
+            match the step's registry_name. Example:
+            {"device_processor": {"device": "cuda"},
+             "normalizer_processor": {"stats": dataset.meta.stats}}
+
+    Returns:
+        A PolicyProcessorPipeline instance with the configured steps. The pipeline
+        name is the config's "name" field, or "policy_preprocessor" when absent.
+
+    Example (JSON format with overrides):
+        ```python
+        config = {
+            "name": "policy_preprocessor",
+            "steps": [
+                {"registry_name": "device_processor", "config": {"device": "cpu"}},
+                {"registry_name": "normalizer_processor", "config": {"eps": 1e-8}}
+            ]
+        }
+        overrides = {
+            "device_processor": {"device": "cuda"},
+            "normalizer_processor": {"stats": dataset.meta.stats, "features": {...}}
+        }
+        preprocessor = make_preprocessor_from_config(config, overrides=overrides)
+        # device_processor will use device="cuda" (overridden)
+        # normalizer_processor will use eps=1e-8 (from config) and stats from overrides
+        ```
+
+    Example (concise list format):
+        ```python
+        steps = [
+            "rename_observations_processor",
+            "device_processor",
+            {"normalizer_processor": {"eps": 1e-8}}
+        ]
+        preprocessor = make_preprocessor_from_config(steps)
+        ```
+
+    Raises:
+        ValueError: If a step configuration is invalid or step cannot be instantiated.
+        KeyError: If a registry name is not found.
+    """
+    from flagscale.train.processor.pipeline import ProcessorStepRegistry
+
+    overrides = overrides or {}
+
+    # Determine format and extract step configs
+    if isinstance(config, (dict, DictConfig)) and "steps" in config:
+        # JSON format: {"name": "...", "steps": [...]}
+        if isinstance(config, DictConfig):
+            config = OmegaConf.to_container(config, resolve=True)
+        step_configs = config["steps"]
+        pipeline_name = config.get("name", "policy_preprocessor")
+    elif isinstance(config, list) or OmegaConf.is_list(config):
+        # Concise list format; an OmegaConf ListConfig is accepted like a plain list.
+        step_configs = config
+        pipeline_name = "policy_preprocessor"
+    else:
+        raise ValueError(
+            f"Config must be a dict with 'steps' key or a list, got {type(config)}"
+        )
+
+    steps = []
+    for step_entry in step_configs:
+        step_dict = _normalize_step_entry(step_entry)
+
+        # Get step class
+        registry_name = step_dict["registry_name"]
+        step_class = ProcessorStepRegistry.get(registry_name)
+
+        # A missing or null config / override (e.g. YAML `- step_name:`) means "no arguments".
+        base_config = step_dict.get("config") or {}
+        step_overrides = overrides.get(registry_name) or {}
+        # Bound before the try block so the error message below can always report it.
+        merged_config: Any = base_config
+        try:
+            # Merge config with overrides (overrides take precedence)
+            merged_config = {**base_config, **step_overrides}
+            steps.append(step_class(**merged_config))
+        except Exception as e:
+            raise ValueError(
+                f"Failed to instantiate processor step '{registry_name}' "
+                f"with config {merged_config}. Error: {e!s}"
+            ) from e
+
+    return PolicyProcessorPipeline(
+        steps=steps,
+        name=pipeline_name,
+    )
+
+
+def make_pre_post_processors(
+    pretrained_path: str | None = None,
+    **kwargs: Unpack[ProcessorConfigKwargs],
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """
+    Load the pre- and post-processor pipelines stored under ``pretrained_path``.
+
+    Both pipelines are loaded with ``PolicyProcessorPipeline.from_pretrained``.
+    The pre-processor converts batches to transitions and back; the
+    post-processor converts policy actions to transitions and back.
+
+    Args:
+        pretrained_path: Path (or name) passed to ``from_pretrained`` for both pipelines.
+        **kwargs: Optional settings as defined in `ProcessorConfigKwargs`:
+            ``preprocessor_config_filename`` / ``postprocessor_config_filename``
+            (defaulting to the default pre/post-processor name plus ``.json``) and
+            ``preprocessor_overrides`` / ``postprocessor_overrides`` (defaulting to ``{}``).
+
+    Returns:
+        A tuple containing the input (pre-processor) and output (post-processor) pipelines.
+    """
+    preprocessor = PolicyProcessorPipeline.from_pretrained(
+        pretrained_model_name_or_path=pretrained_path,
+        config_filename=kwargs.get(
+            "preprocessor_config_filename",
+            f"{POLICY_PREPROCESSOR_DEFAULT_NAME}.json",
+        ),
+        overrides=kwargs.get("preprocessor_overrides", {}),
+        to_transition=batch_to_transition,
+        to_output=transition_to_batch,
+    )
+    postprocessor = PolicyProcessorPipeline.from_pretrained(
+        pretrained_model_name_or_path=pretrained_path,
+        config_filename=kwargs.get(
+            "postprocessor_config_filename",
+            f"{POLICY_POSTPROCESSOR_DEFAULT_NAME}.json",
+        ),
+        overrides=kwargs.get("postprocessor_overrides", {}),
+        to_transition=policy_action_to_transition,
+        to_output=transition_to_policy_action,
+    )
+    return preprocessor, postprocessor
+
+
+def has_method(cls: object, method_name: str) -> bool:
+    """Return True when ``cls`` has an attribute named ``method_name`` that is callable."""
+    # A missing attribute falls back to None, which is not callable.
+    candidate = getattr(cls, method_name, None)
+    return callable(candidate)
+
+
+def update_policy(
+    train_metrics: MetricsTracker,
+    policy,
+    batch: Any,
+    optimizer: Optimizer,
+    use_amp: bool,
+    grad_clip_norm: float,
+    lr_scheduler=None,
+    lock=None,
+) -> MetricsTracker:
+    """
+    Performs a single training step to update the policy's weights.
+
+    This function executes the forward and backward passes, clips gradients, and steps the optimizer and
+    learning rate scheduler. Uses the module-level accelerate `accelerator` (matching starVLA).
+
+    Args:
+        train_metrics: A MetricsTracker instance to record training statistics.
+        policy: The policy model to be trained (wrapped by accelerator).
+        batch: A batch of training data.
+        optimizer: The optimizer used to update the policy's parameters.
+        use_amp: When true, the forward pass runs under CUDA bfloat16 autocast.
+        grad_clip_norm: The maximum norm for gradient clipping. A value <= 0 disables
+            clipping, but the gradient norm is still computed for logging.
+        lr_scheduler: An optional learning rate scheduler, stepped once per call.
+        lock: An optional lock held while the optimizer steps.
+
+    Returns:
+        The same MetricsTracker, with `loss`, `grad_norm`, `lr` and `update_s` set for this step.
+    """
+    start_time = time.perf_counter()
+
+    # Unwrapped model; only needed for the optional `update()` call after the step.
+    policy_model = accelerator.unwrap_model(policy)
+
+    print(f"use_amp: {use_amp}")
+
+    # Use accelerator.accumulate for gradient accumulation support (matching starVLA)
+    with accelerator.accumulate(policy):
+        # NOTE(review): Accelerate's documented pattern calls zero_grad() after
+        # optimizer.step(). If gradient accumulation > 1 is ever enabled with an
+        # accelerate-prepared optimizer, zeroing here may discard gradients
+        # accumulated by earlier micro-batches -- confirm before enabling it.
+        optimizer.zero_grad()
+
+        autocast_context = (
+            torch.amp.autocast("cuda", dtype=torch.bfloat16) if use_amp else nullcontext()
+        )
+        with autocast_context:
+            loss = policy.forward(batch)
+
+        # Use accelerator.backward instead of loss.backward() (matching starVLA)
+        accelerator.backward(loss)
+
+        # With clipping disabled, an infinite max norm leaves the gradients
+        # untouched while still returning the total norm.
+        max_norm = grad_clip_norm if grad_clip_norm > 0 else float("inf")
+        grad_norm = accelerator.clip_grad_norm_(policy.parameters(), max_norm)
+
+        with lock if lock is not None else nullcontext():
+            optimizer.step()
+
+        # Step through pytorch scheduler at every batch instead of epoch
+        if lr_scheduler is not None:
+            lr_scheduler.step()
+
+    # Update internal buffers if policy has update method
+    if has_method(policy_model, "update"):
+        policy_model.update()
+
+    train_metrics.loss = loss.item()
+    # The clip helper may return None; report 0.0 in that case.
+    train_metrics.grad_norm = grad_norm.item() if grad_norm is not None else 0.0
+    train_metrics.lr = optimizer.param_groups[0]["lr"]
+    train_metrics.update_s = time.perf_counter() - start_time
+
+    return train_metrics
+
+
+# Commented out: old update_policy using manual DDP
+# def update_policy_old(
+#     train_metrics: MetricsTracker,
+#     policy,
+#     batch: Any,
+#     optimizer: Optimizer,
+#     use_amp: bool,
+#     grad_clip_norm: float,
+#     lr_scheduler=None,
+#     lock=None,
+# ) -> tuple[MetricsTracker, dict]:
+#     start_time = time.perf_counter()
+#     policy_model = policy.module if isinstance(policy, DDP) else policy
+#     print(f"use_amp: {use_amp}")
+#     autocast_context = (
+#         torch.amp.autocast("cuda", dtype=torch.bfloat16) if use_amp else nullcontext()
+#     )
+#     with autocast_context:
+#         loss = policy.forward(batch)
+#     loss.backward()
+#     if grad_clip_norm > 0:
+#         grad_norm = torch.nn.utils.clip_grad_norm_(
+#             policy.module.parameters() if isinstance(policy, DDP) else policy.parameters(),
+#             grad_clip_norm,
+#         )
+#     else:
+#         grad_norm = torch.nn.utils.clip_grad_norm_(
+#             policy.module.parameters() if isinstance(policy, DDP) else policy.parameters(),
+#             float("inf"),
+#             error_if_nonfinite=False,
+#         )
+#     with lock if lock is not None else nullcontext():
+#         optimizer.step()
+#     optimizer.zero_grad()
+#     if lr_scheduler is not None:
+#         lr_scheduler.step()
+#     if has_method(policy_model, "update"):
+#         policy_model.update()
+#     train_metrics.loss = loss.item()
+#     train_metrics.grad_norm = grad_norm.item()
+#     train_metrics.lr = optimizer.param_groups[0]["lr"]
+#     train_metrics.update_s = time.perf_counter() - start_time
+#     return train_metrics
+
+
+def main(config: TrainConfig, seed: int):
+    """Run the accelerate-based training loop for the policy described by `config`.
+
+    Seeds RNG with `seed`, builds dataset/policy/optimizer, then runs updates up to
+    `config.system.train_steps`, logging every `log_freq` steps and (main process only) saving every `save_freq`.
+    """
+
+    set_seed(seed)
+    print(f"[DEBUG RNG main] After set_seed: torch state[:10] = {torch.get_rng_state()[:10].tolist()}")
+
+    # Use accelerator instead of manual DDP (matching starVLA)
+    device = accelerator.device
+    is_main_process = accelerator.is_main_process
+    accelerator.print(accelerator.state)
+    print(f"[DEBUG RNG main] After accelerator setup: torch state[:10] = {torch.get_rng_state()[:10].tolist()}")
+
+    # Commented out: old manual DDP initialization
+    # local_rank = init_ddp()
+    # device = torch.device("cuda", local_rank)
+    # rank = dist.get_rank()
+    # is_main_process = rank == 0 and local_rank == 0
+    # print(f"[DEBUG RNG main] After init_ddp: torch state[:10] = {torch.get_rng_state()[:10].tolist()}")
+
+    dataset = make_dataset(config.data)
+    print(f"[DEBUG RNG main] After make_dataset: torch state[:10] = {torch.get_rng_state()[:10].tolist()}")
+
+    accelerator.wait_for_everyone()  # Use accelerator instead of dist.barrier()
+
+    # Reset seed before model creation to match starVLA initialization order
+    # (starVLA creates model before dataset, so we reset seed to get same weights)
+    # set_seed(seed)
+    # print(f"[DEBUG RNG main] After 2nd set_seed: torch state[:10] = {torch.get_rng_state()[:10].tolist()}")
+
+    policy, input_features, output_features = make_policy(config=config, ds_meta=dataset.meta)
+    # register_debug_hooks(policy)
+
+    accelerator.wait_for_everyone()  # Use accelerator instead of dist.barrier()
+
+    # Create processors - only provide dataset_stats if not resuming from saved processors
+    processor_kwargs = {}
+    postprocessor_kwargs = {}
+    # Only provide dataset_stats when not resuming from saved processor state
+    processor_kwargs["dataset_stats"] = dataset.meta.stats
+
+    # Prepare overrides for preprocessor steps
+    preprocessor_overrides = {
+        "device_processor": {"device": device.type},
+        "normalizer_processor": {
+            "stats": dataset.meta.stats,
+            "features": {
+                **input_features,
+                **output_features,
+            }
+        },
+        # "tokenizer_processor": {"tokenizer_name": config.model.tokenizer_path},
+    }
+
+    num_workers = 0 # config.system.num_workers
+    shuffle = config.system.shuffle
+
+    # # Wrap dataset with StarVLAFormatDataset for starVLA-compatible output format
+    # image_keys = getattr(config.data, "image_keys", None) or [
+    #     "observation.images.image",
+    #     "observation.images.wrist_image",
+    # ]
+    # starvla_dataset = StarVLAFormatDataset(
+    #     dataset,
+    #     image_keys=image_keys,
+    #     image_size=(224, 224),
+    # )
+
+    # DistributedSampler ensures each rank gets different data
+    # Use accelerator's process info (matching starVLA pattern)
+    sampler = torch.utils.data.distributed.DistributedSampler(
+        dataset,
+        # starvla_dataset,
+        num_replicas=accelerator.num_processes,
+        rank=accelerator.process_index,
+        shuffle=shuffle,
+        drop_last=False,
+    )
+
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        # starvla_dataset,
+        num_workers=num_workers,
+        batch_size=config.system.batch_size,
+        shuffle=False,  # Must be False when using sampler
+        sampler=sampler,
+        pin_memory=True,
+        drop_last=False,
+        prefetch_factor=2 if num_workers > 0 else None,
+        # collate_fn=collate_fn_starvla,  # Return batch as list of dicts (starVLA style)
+    )
+
+    # Setup preprocessor
+    preprocessor = None
+    if config.data.preprocessor is not None:
+        preprocessor = make_preprocessor_from_config(
+            config.data.preprocessor,
+            overrides=preprocessor_overrides
+        )
+
+    # Setup optimizer and scheduler (applies freeze config before accelerator.prepare)
+    optimizer, lr_scheduler = setup_optimizer_and_scheduler(policy, config)
+
+    # Use accelerator.prepare instead of manual DDP wrapping (matching starVLA)
+    # This handles DDP wrapping, moving to device, etc.
+    accelerator.dataloader_config.dispatch_batches = False  # Match starVLA setting
+    policy, optimizer, dataloader = accelerator.prepare(policy, optimizer, dataloader)
+
+    # Commented out: old manual DDP wrapping
+    # policy = DDP(
+    #     policy,
+    #     device_ids=[local_rank],
+    #     find_unused_parameters=True,
+    #     output_device=local_rank,
+    # )
+
+    accelerator.wait_for_everyone()  # Use accelerator instead of dist.barrier()
+
+    dl_iter = cycle(dataloader)
+
+    # policy.train()
+
+    train_metrics = {
+        "loss": AverageMeter("loss", ":.3f"),
+        "grad_norm": AverageMeter("grdn", ":.3f"),
+        "lr": AverageMeter("lr", ":0.1e"),
+        "update_s": AverageMeter("updt_s", ":.3f"),
+        "dataloading_s": AverageMeter("data_s", ":.3f"),
+    }
+
+    # Use accelerator.num_processes instead of dist.get_world_size()
+    effective_batch_size = config.system.batch_size * accelerator.num_processes
+
+    step = 0
+
+    train_tracker = MetricsTracker(
+        effective_batch_size,
+        dataset.num_frames,
+        dataset.num_episodes,
+        train_metrics,
+        initial_step=step,
+    )
+
+    # Reseed the sampler per epoch for proper shuffling; samples_per_epoch counts steps per epoch, not samples
+    epoch = 0
+    samples_per_epoch = len(dataset) // effective_batch_size
+    sampler.set_epoch(epoch)
+
+    action_stats = dataset.meta.stats.get("action", {})
+    if is_main_process:
+        print(f"[DEBUG GRIPPER] action stats min: {action_stats.get('min', 'N/A')}")
+        print(f"[DEBUG GRIPPER] action stats max: {action_stats.get('max', 'N/A')}")
+    _debug_dumped = False
+
+    for _ in range(step, config.system.train_steps):
+        start_time = time.perf_counter()
+        batch = next(dl_iter)
+        batch = {
+            k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v
+            for k, v in batch.items()
+        }
+
+        if not _debug_dumped and is_main_process and "action" in batch:
+            print(f"[DEBUG GRIPPER] BEFORE preproc action[0,0,:]: {batch['action'][0, 0, :].tolist()}")
+            print(f"[DEBUG GRIPPER] BEFORE preproc action[:,: ,6] (gripper): {batch['action'][:, :, 6].flatten()[:16].tolist()}")
+
+        if preprocessor is not None:
+            batch = preprocessor(batch)
+        train_tracker.dataloading_s = time.perf_counter() - start_time
+
+        if not _debug_dumped and is_main_process and "action" in batch:
+            print(f"[DEBUG GRIPPER] AFTER preproc action[0,0,:]: {batch['action'][0, 0, :].tolist()}")
+            print(f"[DEBUG GRIPPER] AFTER preproc action[:,:,6] (gripper): {batch['action'][:, :, 6].flatten()[:16].tolist()}")
+            _debug_dumped = True
+
+        st = time.perf_counter()
+        train_tracker = update_policy(
+            train_tracker,
+            policy,
+            batch,
+            optimizer,
+            use_amp=config.system.use_amp,
+            grad_clip_norm=config.system.grad_clip_norm,
+            lr_scheduler=lr_scheduler,
+        )
+        print(f"update_policy time: {time.perf_counter() - st}")
+        print(f"train_tracker at step {step}: {format_train_tracker_step(train_tracker)}")
+
+        step += 1
+        train_tracker.step()
+
+        # Update epoch counter for sampler.set_epoch() when we've processed one epoch worth of samples
+        # This ensures proper data shuffling across epochs in distributed training
+        if samples_per_epoch > 0 and step % samples_per_epoch == 0:
+            epoch += 1
+            sampler.set_epoch(epoch)
+
+        if step % config.system.log_freq == 0 and is_main_process:
+            logger.info(f"step: {step} {format_train_tracker_step(train_tracker)}")
+            train_tracker.reset_averages()
+
+        if (
+            config.system.checkpoint.save_checkpoint
+            and step % config.system.checkpoint.save_freq == 0
+        ):
+            # Synchronize all processes before checkpoint saving
+            accelerator.wait_for_everyone()
+
+            if is_main_process:
+                from pathlib import Path
+                logger.info(f"Saving checkpoint at step {step}")
+                output_dir = Path(config.system.checkpoint.output_directory)
+                checkpoint_dir = get_step_checkpoint_dir(
+                    output_dir, config.system.train_steps, step
+                )
+                # Use accelerator.unwrap_model instead of policy.module
+                policy_to_save = accelerator.unwrap_model(policy)
+                save_checkpoint(
+                    checkpoint_dir=checkpoint_dir,
+                    policy=policy_to_save,
+                    config=config,
+                    preprocessor=preprocessor,
+                )
+                update_last_checkpoint(checkpoint_dir)
+
+            # Synchronize all processes after checkpoint saving
+            accelerator.wait_for_everyone()
+
+    if is_main_process:
+        logger.info("Training completed")
+
+    # Properly clean up using accelerator (matching starVLA)
+    accelerator.wait_for_everyone()
+    # Note: accelerator handles process group cleanup automatically
+    # dist.destroy_process_group()  # Commented out: handled by accelerator
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Train QwenGr00t model. This script is typically called by the flagscale runner, not directly."
+    )
+    parser.add_argument(
+        "--config-file", type=str, required=True, help="Path to the configuration YAML file"
+    )
+    args = parser.parse_args()
+
+    config_file_path = args.config_file
+
+    # Load config from YAML file (Hydra-generated config.yaml contains both train and experiment)
+    config = OmegaConf.load(config_file_path)
+
+    logger.info(f"full config: {config}")
+
+    # Build the TrainConfig object from the loaded config via TrainConfig.from_hydra_config
+    train_config = TrainConfig.from_hydra_config(config)
+
+    # Resolve the experiment section into a plain container; seed falls back to 42 when unset
+    experiment_config = OmegaConf.to_container(config.experiment, resolve=True)
+    seed = experiment_config.get("seed", 42)
+
+    logger.info("=" * 100)
+    logger.info(f"Experiment: {experiment_config}")
+    logger.info(f"Train config: {train_config}")
+
+    main(train_config, seed)
diff --git a/flagscale/train/utils/image_tools.py b/flagscale/train/utils/image_tools.py
new file mode 100644
index 0000000000..d6a1978a71
--- /dev/null
+++ b/flagscale/train/utils/image_tools.py
@@ -0,0 +1,127 @@
+import numpy as np
+from PIL import Image
+
+
+def convert_to_uint8(img: np.ndarray) -> np.ndarray:
+    """Convert a float image to uint8 (smaller to send over the network); other dtypes pass through.
+
+    Floats are clipped to [0, 1] before scaling by 255, so out-of-range values saturate instead of wrapping.
+    """
+    if np.issubdtype(img.dtype, np.floating):
+        img = (255 * np.clip(img, 0.0, 1.0)).astype(np.uint8)
+    return img
+
+
+def resize_with_pad(
+    images: np.ndarray, height: int, width: int, method=Image.BILINEAR
+) -> np.ndarray:
+    """Letterbox-resize a batch of images with PIL, mirroring tf.image.resize_with_pad.
+
+    Args:
+        images: Image batch laid out as [..., height, width, channel].
+        height: Desired output height.
+        width: Desired output width.
+        method: PIL resampling filter; bilinear unless overridden.
+
+    Returns:
+        Array shaped [..., height, width, channel] holding the resized, padded images.
+    """
+    # Fast path: nothing to do when the spatial dims already match the target.
+    if images.shape[-3:-1] == (height, width):
+        return images
+
+    lead_dims = images.shape[:-3]
+    # Collapse every leading axis into one so each image can be processed on its own.
+    flat = images.reshape(-1, *images.shape[-3:])
+    padded = [_resize_with_pad_pil(Image.fromarray(frame), height, width, method=method) for frame in flat]
+    stacked = np.stack(padded)
+    # Restore the caller's leading axes in front of the new (H, W, C).
+    return stacked.reshape(*lead_dims, *stacked.shape[-3:])
+
+
+def _resize_with_pad_pil(image: Image.Image, height: int, width: int, method: int) -> Image.Image:
+    """Resize one PIL image to fit inside (width, height) without distortion, zero-padding the rest.
+
+    Mirrors tf.image.resize_with_pad. PIL sizes are (width, height). Resized dims are floored but kept
+    >= 1 so extreme aspect ratios cannot ask PIL for an empty image; an exact-size input is returned as-is.
+    """
+    cur_width, cur_height = image.size
+    if cur_width == width and cur_height == height:
+        return image  # No need to resize if the image is already the correct size.
+
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = max(1, int(cur_height / ratio))
+    resized_width = max(1, int(cur_width / ratio))
+    resized_image = image.resize((resized_width, resized_height), resample=method)
+
+    zero_image = Image.new(resized_image.mode, (width, height), 0)
+    pad_height = max(0, int((height - resized_height) / 2))
+    pad_width = max(0, int((width - resized_width) / 2))
+    zero_image.paste(resized_image, (pad_width, pad_height))
+    assert zero_image.size == (width, height)
+    return zero_image
+
+
+from typing import Any
+
+
+def to_pil_preserve(images: Any, scale_float: bool = True):
+    """
+    Turn numpy image arrays (optionally nested in lists/tuples) into PIL images, leaving
+    spatial size and container layout untouched.
+
+    Supported inputs:
+      - np.ndarray shaped (H, W, C) with C in {1, 3, 4}
+      - PIL.Image.Image, passed through unchanged
+      - lists / tuples (arbitrarily nested) of the above
+
+    Behavior:
+      - Never resizes, pads or crops
+      - list stays list, tuple stays tuple
+      - Float arrays are clipped to [0, 1] and mapped to uint8 via *255 + 0.5 when
+        scale_float=True; other non-uint8 dtypes are cast with astype(np.uint8)
+      - C == 1 yields mode "L", C == 3 "RGB", C == 4 "RGBA"
+    Args:
+      images: array, PIL image, or nested list/tuple of them
+      scale_float: allow float -> uint8 scaling; a float array raises TypeError when False
+    Returns:
+      Same structure as the input with every leaf a PIL.Image.Image
+    Raises:
+      ValueError for non-3D arrays or bad channel counts; TypeError for unsupported leaves
+    """
+
+    def _array_to_pil(arr):
+        # Validate layout first: exactly (H, W, C) with a supported channel count.
+        if arr.ndim != 3:
+            raise ValueError(f"Expected 3D array (H,W,C), got shape={arr.shape}")
+        channels = arr.shape[2]
+        if channels not in (1, 3, 4):
+            raise ValueError(f"Channel count must be 1/3/4, got {arr.shape[2]}")
+        # Floats: clip to [0, 1], then round to the nearest uint8 level.
+        if np.issubdtype(arr.dtype, np.floating):
+            if not scale_float:
+                raise TypeError("Float array provided but scale_float=False")
+            arr = (np.clip(arr, 0.0, 1.0) * 255.0 + 0.5).astype(np.uint8)
+        elif arr.dtype != np.uint8:
+            arr = arr.astype(np.uint8)
+        # Mode follows the channel count: 1 -> "L", 3 -> "RGB", 4 -> "RGBA".
+        if channels == 1:
+            return Image.fromarray(arr[:, :, 0], mode="L")
+        return Image.fromarray(arr, mode="RGB" if channels == 3 else "RGBA")
+
+    def _convert(obj):
+        # Containers recurse and keep their own type.
+        if isinstance(obj, list):
+            return [_convert(item) for item in obj]
+        if isinstance(obj, tuple):
+            return tuple(_convert(item) for item in obj)
+        # PIL images pass through untouched.
+        if isinstance(obj, Image.Image):
+            return obj
+        # Arrays are the only leaves that need conversion.
+        if isinstance(obj, np.ndarray):
+            return _array_to_pil(obj)
+        # Any other leaf type is rejected.
+        raise TypeError(f"Unsupported element type: {type(obj)}")
+
+    return _convert(images)
diff --git a/flagscale/train/utils/optim_setup.py b/flagscale/train/utils/optim_setup.py
new file mode 100644
index 0000000000..6b6de3b2ce
--- /dev/null
+++ b/flagscale/train/utils/optim_setup.py
@@ -0,0 +1,329 @@
+"""Optimizer setup utilities: parameter freezing and per-module optimizer config.
+
+Supports:
+- Freezing parameters via regex patterns
+- Per-module optimizer settings (lr, weight_decay, betas, etc.) via config
+
+Example config (YAML):
+    model:
+      freeze:
+        freeze_patterns: ["backbone.*"]
+    system:
+      optimizer:
+        lr: 1e-4
+        weight_decay: 0.01
+        param_groups:
+          qwen_backbone:
+            lr: 1e-5
+          action_head:
+            lr: 2e-4
+            weight_decay: 0.0
+"""
+
+import re
+from collections import defaultdict
+from collections.abc import Generator, Iterable
+from typing import TYPE_CHECKING, Any
+
+import torch
+import torch.nn as nn
+from transformers import get_scheduler
+
+from flagscale.runner.utils import logger
+
+if TYPE_CHECKING:
+    from flagscale.train.train_config import (
+        FreezeConfig,
+        OptimizerConfig,
+        SchedulerConfig,
+        TrainConfig,
+    )
+
+
+class PatternMatcher:
+    """Match names against regex patterns (via re.search) and track which patterns ever matched."""
+
+    def __init__(self, patterns: list[str]):
+        self.patterns = patterns
+        self.compiled = [re.compile(p) for p in patterns]
+        self.match_counts = {p: 0 for p in patterns}
+
+    def matches(self, name: str) -> bool:
+        """Return True if any pattern matches; every matching pattern is counted, not just the first."""
+        hits = [raw for raw, regex in zip(self.patterns, self.compiled) if regex.search(name)]
+        for raw in hits:
+            self.match_counts[raw] += 1
+        return bool(hits)
+
+    def get_unused_patterns(self) -> list[str]:
+        return [p for p, count in self.match_counts.items() if count == 0]
+
+
+def freeze_and_get_trainable_params(
+    named_parameters: Iterable[tuple[str, torch.nn.Parameter]],
+    freeze_patterns: list[str] | None = None,
+    keep_patterns: list[str] | None = None,
+) -> Generator[torch.nn.Parameter, None, None]:
+    """
+    Set requires_grad on each parameter from the patterns, yielding the trainable ones.
+
+    A parameter is frozen only when its name matches a freeze pattern and no keep pattern;
+    every other parameter has requires_grad set to True. Being a generator, it does this lazily
+    as the caller iterates; the summary and warnings are logged only once it is exhausted.
+
+    Args:
+        named_parameters: (name, parameter) pairs, e.g. model.named_parameters()
+        freeze_patterns: Regex patterns selecting params to freeze
+        keep_patterns: Regex patterns that exempt a param from freezing
+
+    Yields:
+        Each parameter left trainable, in input order.
+    """
+    to_freeze = PatternMatcher(freeze_patterns or [])
+    to_keep = PatternMatcher(keep_patterns or [])
+    n_trainable = 0
+    n_frozen = 0
+
+    for param_name, tensor in named_parameters:
+        # Keep patterns are only consulted for names that a freeze pattern already matched.
+        if to_freeze.matches(param_name) and not to_keep.matches(param_name):
+            tensor.requires_grad = False
+            n_frozen += tensor.numel()
+            continue
+        tensor.requires_grad = True
+        n_trainable += tensor.numel()
+        yield tensor
+
+    # Summary of how many elements ended up trainable vs frozen.
+    n_total = n_trainable + n_frozen
+    share = n_trainable / n_total if n_total > 0 else 0
+    logger.info(
+        f"Parameters: trainable={n_trainable:,} ({share:.2%}) | "
+        f"frozen={n_frozen:,} | total={n_total:,}"
+    )
+
+    # Report patterns that never matched any parameter name.
+    if unused_freeze := to_freeze.get_unused_patterns():
+        logger.warning(f"Freeze patterns matched nothing: {unused_freeze}")
+    if unused_keep := to_keep.get_unused_patterns():
+        logger.warning(f"Keep patterns matched nothing: {unused_keep}")
+
+
+def apply_freeze_config(model: nn.Module, freeze_config) -> list:
+    """
+    Freeze model parameters per `freeze_config` and collect the ones left trainable.
+
+    Args:
+        model: Module whose parameters are frozen in place
+        freeze_config: Object exposing freeze_patterns and keep_patterns, or None
+
+    Returns:
+        Trainable parameters as a list; with no config, every model parameter unchanged
+    """
+    if freeze_config is not None:
+        # list() drains the generator, which is what actually applies requires_grad.
+        trainable = freeze_and_get_trainable_params(
+            model.named_parameters(),
+            freeze_patterns=freeze_config.freeze_patterns,
+            keep_patterns=freeze_config.keep_patterns,
+        )
+        return list(trainable)
+
+    return list(model.parameters())
+
+
+def log_trainable_params(model: nn.Module) -> dict:
+    """Log per-top-level-module trainable/frozen element counts and return them as two dicts."""
+    counts = {"trainable": defaultdict(int), "frozen": defaultdict(int)}
+
+    for param_name, tensor in model.named_parameters():
+        # The first dotted component of the parameter name is the top-level module.
+        top_level = param_name.split(".")[0]
+        bucket = "trainable" if tensor.requires_grad else "frozen"
+        counts[bucket][top_level] += tensor.numel()
+
+    divider = "=" * 60
+    logger.info(divider)
+    logger.info("Parameter status by top-level module:")
+    # Union of keys so modules that are entirely frozen or entirely trainable both appear.
+    for mod in sorted(set(counts["trainable"]) | set(counts["frozen"])):
+        t = counts["trainable"].get(mod, 0)
+        f = counts["frozen"].get(mod, 0)
+        logger.info(f"  {mod}: {t:,} trainable, {f:,} frozen")
+    logger.info(divider)
+
+    # Hand back plain dicts rather than the defaultdicts used for counting.
+    return {"trainable": dict(counts["trainable"]), "frozen": dict(counts["frozen"])}
+
+
+def print_param_names(model: nn.Module, pattern: str | None = None):
+    """Debug helper: print each parameter's status, name and size; `pattern` (re.search) filters names."""
+    picked = (kv for kv in model.named_parameters() if pattern is None or re.search(pattern, kv[0]))
+    for name, param in picked:
+        status = "trainable" if param.requires_grad else "FROZEN"
+        print(f"[{status}] {name}: {param.numel():,} params")
+
+
+# TODO: (yupu) Freeze supports regex patterns, but param groups uses exact module paths. See if this is reasonable.
+def build_optim_param_groups(
+    model: nn.Module,
+    optim_param_groups_config: dict[str, dict[str, Any]] | None = None,
+) -> list[dict]:
+    """
+    Build optimizer param groups with per-module settings.
+
+    Each configured module gets its own group and optimizer kwargs (lr, weight_decay, betas, ...).
+    A parameter joins only one group: when modules overlap/nest or tie weights, the first-listed
+    module keeps it, since torch optimizers reject a parameter that appears in several groups.
+
+    Args:
+        model: The model to create param groups for.
+        optim_param_groups_config: Dict mapping module paths (model.get_submodule syntax) to kwargs,
+            e.g. {"encoder": {"lr": 1e-5}, "decoder": {"lr": 1e-3, "weight_decay": 0.01}}.
+
+    Returns:
+        List of param group dicts; the "default" group of leftover trainable params comes first.
+    """
+    if optim_param_groups_config is None:
+        return [{"params": [p for p in model.parameters() if p.requires_grad]}]
+
+    param_groups = []
+    used_param_ids = set()
+
+    for module_name, group_config in optim_param_groups_config.items():
+        try:
+            module = model.get_submodule(module_name)
+        except AttributeError:
+            # TODO: (yupu) logger can't print the current module name and line number
+            logger.warning(
+                f"build_optim_param_groups: Module '{module_name}' not found in model, skipping."
+            )
+            continue
+
+        # Trainable params of this module that no earlier group has already claimed.
+        candidates = (p for p in module.parameters() if p.requires_grad)
+        params = [p for p in candidates if id(p) not in used_param_ids]
+        if not params:
+            logger.warning(
+                f"build_optim_param_groups: Module '{module_name}' has no unassigned trainable parameters."
+            )
+            continue
+        used_param_ids.update(id(p) for p in params)
+        param_groups.append({"params": params, "name": module_name, **group_config})
+        param_count = sum(p.numel() for p in params)
+        logger.info(f"Param group '{module_name}': {param_count:,} params, {group_config}")
+
+    # Remaining params go to default group
+    other_params = [
+        p for p in model.parameters() if p.requires_grad and id(p) not in used_param_ids
+    ]
+    if other_params:
+        param_groups.insert(0, {"params": other_params, "name": "default"})
+        logger.info(f"Param group 'default': {sum(p.numel() for p in other_params):,} params")
+
+    return param_groups
+
+
+def setup_optimizer(
+    model: nn.Module,
+    optimizer_config: "OptimizerConfig",
+    freeze_config: "FreezeConfig | None" = None,
+) -> torch.optim.Optimizer:
+    """
+    Freeze parameters (when configured), group the trainable ones, and build the optimizer.
+
+    Args:
+        model: Module to optimize; freezing mutates its parameters' requires_grad.
+        optimizer_config: Supplies name, param_groups and get_optimizer_kwargs().
+        freeze_config: Optional freeze/keep patterns; freezing is skipped when None.
+
+    Returns:
+        Instance of the optimizer class registered under optimizer_config.name.
+    """
+    if freeze_config is not None:
+        # Called for its side effect on requires_grad; the returned list is not needed here.
+        apply_freeze_config(model, freeze_config)
+        log_trainable_params(model)
+
+    init_kwargs = {"params": build_optim_param_groups(model, optimizer_config.param_groups)}
+    # Config kwargs are merged after "params", so they win on any key collision.
+    init_kwargs.update(optimizer_config.get_optimizer_kwargs())
+    optimizer_cls = _get_optimizer_class(optimizer_config.name)
+    return optimizer_cls(**init_kwargs)
+
+
+# Supported optimizers
+_OPTIMIZER_REGISTRY: dict[str, type[torch.optim.Optimizer]] = {
+    "AdamW": torch.optim.AdamW,
+}
+
+
+def _get_optimizer_class(name: str) -> type[torch.optim.Optimizer]:
+    """Look up an optimizer class in _OPTIMIZER_REGISTRY; raise ValueError for unknown names."""
+    if name in _OPTIMIZER_REGISTRY:
+        return _OPTIMIZER_REGISTRY[name]
+    supported = list(_OPTIMIZER_REGISTRY.keys())
+    raise ValueError(f"Unsupported optimizer: {name}. Supported: {supported}")
+
+
+def setup_scheduler(
+    optimizer: torch.optim.Optimizer,
+    scheduler_config: "SchedulerConfig",
+    num_training_steps: int,
+) -> torch.optim.lr_scheduler.LRScheduler:
+    """
+    Build the LR scheduler by delegating to transformers' get_scheduler.
+
+    Args:
+        optimizer: Optimizer whose learning rate is scheduled.
+        scheduler_config: Provides name, warmup_steps and scheduler_kwargs.
+        num_training_steps: Total number of training steps passed to the scheduler.
+
+    Returns:
+        The scheduler object produced by get_scheduler.
+
+    Raises:
+        ValueError: When scheduler_config.name is None.
+    """
+    if scheduler_config.name is None:
+        raise ValueError("scheduler_config.name must be specified to use setup_scheduler")
+    # scheduler_kwargs is forwarded untouched as scheduler_specific_kwargs.
+    scheduler_args = {
+        "name": scheduler_config.name,
+        "optimizer": optimizer,
+        "num_warmup_steps": scheduler_config.warmup_steps,
+        "num_training_steps": num_training_steps,
+        "scheduler_specific_kwargs": scheduler_config.scheduler_kwargs,
+    }
+    return get_scheduler(**scheduler_args)
+
+
+def setup_optimizer_and_scheduler(
+    model: nn.Module,
+    train_config: "TrainConfig",
+) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
+    """
+    Build the optimizer and its LR scheduler from a single TrainConfig.
+
+    Args:
+        model: Module to optimize.
+        train_config: Read for system.optimizer, system.scheduler, system.train_steps
+            and model.freeze.
+
+    Returns:
+        (optimizer, lr_scheduler), with the scheduler bound to that optimizer.
+
+    Raises:
+        ValueError: Propagated from setup_scheduler when the scheduler name is None.
+    """
+    system_cfg = train_config.system
+    # The optimizer must exist first: the scheduler is constructed around it.
+    optimizer = setup_optimizer(
+        model, system_cfg.optimizer, freeze_config=train_config.model.freeze
+    )
+    scheduler = setup_scheduler(
+        optimizer,
+        system_cfg.scheduler,
+        num_training_steps=system_cfg.train_steps,
+    )
+    return optimizer, scheduler
diff --git a/flagscale/train/utils/train_utils.py b/flagscale/train/utils/train_utils.py
index 6a0b6e1c8a..ed695b3b2c 100644
--- a/flagscale/train/utils/train_utils.py
+++ b/flagscale/train/utils/train_utils.py
@@ -17,18 +17,13 @@
 # limitations under the License.
 from pathlib import Path
 
-# from lerobot.configs.train import TrainPipelineConfig
-from flagscale.models.pi0.modeling_pi0 import PI0Policy
+from omegaconf import OmegaConf
+from safetensors.torch import load_model, save_model
 
-# from lerobot.optim.optimizers import load_optimizer_state, save_optimizer_state
-# from lerobot.optim.schedulers import load_scheduler_state, save_scheduler_state
-# from lerobot.policies.pretrained import PreTrainedPolicy
-# from lerobot.processor import PolicyProcessorPipeline
 from flagscale.models.utils.constants import (
     CHECKPOINTS_DIR,
     LAST_CHECKPOINT_LINK,
     PRETRAINED_MODEL_DIR,
-    # TRAINING_STATE_DIR,
     TRAINING_STEP,
 )
 from flagscale.train.datasets.utils import load_json, write_json
@@ -66,46 +61,128 @@ def update_last_checkpoint(checkpoint_dir: Path) -> Path:
 
 def save_checkpoint(
     checkpoint_dir: Path,
-    # step: int,
-    # cfg: PI0Config,
-    policy: PI0Policy,
-    # optimizer: Optimizer,
-    # scheduler: LRScheduler | None = None,
-    # preprocessor: PolicyProcessorPipeline | None = None,
-    # postprocessor: PolicyProcessorPipeline | None = None,
+    policy,
+    config,
+    preprocessor=None,
+    postprocessor=None,
 ) -> None:
-    """This function creates the following directory structure:
-
-    005000/  #  training step at checkpoint
-    ├── pretrained_model/
-    │   ├── config.json  # policy config
-    │   ├── model.safetensors  # policy weights
-    │   ├── train_config.json  # train config
-    │   ├── processor.json  # processor config (if preprocessor provided)
-    │   └── step_*.safetensors  # processor state files (if any)
-    └── training_state/
-        ├── optimizer_param_groups.json  #  optimizer param groups
-        ├── optimizer_state.safetensors  # optimizer state
-        ├── rng_state.safetensors  # rng states
-        ├── scheduler_state.json  # scheduler state
-        └── training_step.json  # training step
+    """Save model weights, train config, and optional pre/postprocessor state.
+
+    Creates the following directory structure:
+        005000/
+        └── pretrained_model/
+            ├── train_config.yaml              # train config (OmegaConf)
+            ├── model.safetensors              # All weights (VLM + action head)
+            ├── policy_preprocessor.json       # Preprocessor pipeline config
+            └── policy_preprocessor_step_*.safetensors  # Norm stats
+
+    Args:
+        checkpoint_dir: Directory to save checkpoint (e.g., checkpoints/005000)
+        policy: The model; written with safetensors save_model
+        config: Training config (OmegaConf, object exposing model_dump(), or dict)
+        preprocessor, postprocessor: Optional pipelines, each saved via save_pretrained()
+    """
+    pretrained_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
+    pretrained_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save train config as YAML
+    # Handle OmegaConf, Pydantic, and dict configs
+    if hasattr(config, "model_dump"):
+        config = OmegaConf.create(config.model_dump())
+    elif not OmegaConf.is_config(config):
+        config = OmegaConf.create(config)
+    OmegaConf.save(config, pretrained_dir / "train_config.yaml")
+
+    # Save model weights (save_model handles shared tensors like tied embeddings)
+    save_model(policy, pretrained_dir / "model.safetensors")
+
+    if preprocessor is not None:
+        preprocessor.save_pretrained(pretrained_dir)
+    if postprocessor is not None:
+        postprocessor.save_pretrained(pretrained_dir)
+
+
+def load_checkpoint(
+    checkpoint_dir: str | Path,
+    model_cls,
+    device: str = "cpu",
+):
+    """Load config, model weights, and pre/postprocessors from checkpoint.
 
     Args:
-        cfg (TrainPipelineConfig): The training config used for this run.
-        step (int): The training step at that checkpoint.
-        policy (PreTrainedPolicy): The policy to save.
-        optimizer (Optimizer | None, optional): The optimizer to save the state from. Defaults to None.
-        scheduler (LRScheduler | None, optional): The scheduler to save the state from. Defaults to None.
-        preprocessor: The preprocessor/pipeline to save. Defaults to None.
+        checkpoint_dir: Checkpoint directory (e.g., checkpoints/005000)
+        model_cls: Model class, called as model_cls(config) with the loaded train config.
+        device: Device to load weights to
+
+    Returns:
+        Tuple of (model, preprocessor, postprocessor); either processor is None
+        when its JSON config file is absent from the checkpoint.
+
+    Raises:
+        FileNotFoundError: If checkpoint directory or required files don't exist
     """
+    from flagscale.train.processor import PolicyProcessorPipeline
+
+    print(f"Loading checkpoint from {checkpoint_dir}")
+
+    if isinstance(checkpoint_dir, str):
+        checkpoint_dir = Path(checkpoint_dir)
+
     pretrained_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
-    policy.save_pretrained(pretrained_dir)
-    # cfg.save_pretrained(pretrained_dir)
-    # if preprocessor is not None:
-    #     preprocessor.save_pretrained(pretrained_dir)
-    # if postprocessor is not None:
-    #     postprocessor.save_pretrained(pretrained_dir)
-    # save_training_state(checkpoint_dir, step, optimizer, scheduler)
+
+    if not pretrained_dir.is_dir():
+        raise FileNotFoundError(f"Checkpoint directory not found: {pretrained_dir}")
+
+    config_path = pretrained_dir / "train_config.yaml"
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    config = OmegaConf.load(config_path)
+
+    model = model_cls(config)
+
+    weights_path = pretrained_dir / "model.safetensors"
+    if not weights_path.exists():
+        raise FileNotFoundError(f"Weights file not found: {weights_path}")
+    # TODO: (yupu) Some modules could be loaded twice
+    missing_keys, unexpected_keys = load_model(model, weights_path, device=device)
+    if missing_keys:
+        print(f"Warning: Missing keys when loading checkpoint: {len(missing_keys)} keys")
+        if len(missing_keys) <= 10:
+            for key in missing_keys:
+                print(f"  - {key}")
+        else:
+            for key in missing_keys[:10]:
+                print(f"  - {key}")
+            print(f"  ... and {len(missing_keys) - 10} more")
+    if unexpected_keys:
+        print(f"Warning: Unexpected keys in checkpoint: {len(unexpected_keys)} keys")
+        if len(unexpected_keys) <= 10:
+            for key in unexpected_keys:
+                print(f"  - {key}")
+        else:
+            for key in unexpected_keys[:10]:
+                print(f"  - {key}")
+            print(f"  ... and {len(unexpected_keys) - 10} more")
+
+    model.to(device)
+
+    preprocessor = None
+    preprocessor_config_path = pretrained_dir / "policy_preprocessor.json"
+    if preprocessor_config_path.exists():
+        preprocessor = PolicyProcessorPipeline.from_pretrained(
+            pretrained_dir,
+            config_filename="policy_preprocessor.json",
+        )
+
+    postprocessor = None
+    postprocessor_config_path = pretrained_dir / "policy_postprocessor.json"
+    if postprocessor_config_path.exists():
+        postprocessor = PolicyProcessorPipeline.from_pretrained(
+            pretrained_dir,
+            config_filename="policy_postprocessor.json",
+        )
+
+    return model, preprocessor, postprocessor
 
 
 # def save_training_state(
diff --git a/flagscale/train/utils/trainer_tools.py b/flagscale/train/utils/trainer_tools.py
new file mode 100644
index 0000000000..f6de578528
--- /dev/null
+++ b/flagscale/train/utils/trainer_tools.py
@@ -0,0 +1,520 @@
+"""
+trainer_tools.py
+
+Training utilities: CLI dotlist-argument normalization, per-module learning-rate parameter groups, main-process
+helpers, image resizing, and the TrainerUtils collection (freezing, checkpoint loading, gradient and evaluation helpers).
+"""
+
+import json
+import re
+
+import numpy as np
+import torch
+from accelerate.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+# === Command-line argument helpers ===
+#
+
+# utils/cli_parser.py
+
+
+def normalize_dotlist_args(args):
+    """
+    Rewrite CLI-style tokens as dotlist entries.
+
+    ``--key=value`` becomes ``key=value``, ``--key value`` becomes ``key=value``, and a
+    bare ``--flag`` becomes ``flag=true``. Tokens that do not start with ``--`` and were
+    not consumed as a value are dropped.
+    """
+    dotlist = []
+    pos = 0
+    while pos < len(args):
+        token = args[pos]
+        pos += 1
+        if not token.startswith("--"):
+            continue  # orphaned value: ignore it
+        name = token.lstrip("-")
+        if "=" in name:
+            dotlist.append(name)
+        elif pos < len(args) and not args[pos].startswith("--"):
+            dotlist.append(f"{name}={args[pos]}")
+            pos += 1  # the value token has been consumed
+        else:
+            dotlist.append(f"{name}=true")
+    return dotlist
+
+
+def build_param_lr_groups(model, cfg):
+    """
+    Build optimizer param groups from cfg.trainer.learning_rate.
+    Each non-"base" key is a dotted module path with its own learning rate; the rest use base.
+
+    Args:
+        model: nn.Module model object
+        cfg: config object, requires cfg.trainer.learning_rate dictionary
+
+    Returns:
+        List[Dict]: param_groups that can be used to build optimizer with torch.optim
+    """
+
+    lr_cfg = cfg.trainer.learning_rate
+    base_lr = lr_cfg.get("base", 1e-4)  # default base learning rate
+
+    freeze_modules = cfg.trainer.get("freeze_modules", "")
+    if not isinstance(freeze_modules, str):
+        freeze_modules = ""
+    freeze_patterns = [p.strip() for p in freeze_modules.split(",") if p.strip()]
+
+    used_params = set()
+    frozen_params = set()
+    param_groups = []
+
+    for freeze_path in freeze_patterns:
+        module = model
+        try:
+            for attr in freeze_path.split("."):
+                module = getattr(module, attr)
+            frozen_params.update(id(p) for p in module.parameters())
+        except AttributeError:
+            print(f"⚠️ freeze module path does not exist: {freeze_path}")
+            continue
+
+    for module_name, lr in lr_cfg.items():
+        if module_name == "base":
+            continue
+        # try to find the module under model by module_name (support nested paths)
+        module = model
+        try:
+            for attr in module_name.split("."):
+                module = getattr(module, attr)
+            # filter out frozen parameters
+            params = [p for p in module.parameters() if id(p) not in frozen_params]
+            if params:  # only add param group if there are trainable parameters
+                param_groups.append({"params": params, "lr": lr, "name": module_name})
+                used_params.update(id(p) for p in params)
+        except AttributeError:
+            print(f"⚠️ module path `{module_name}` not found in model")  # warn instead of building an unused exception
+
+    # assign base learning rate to the remaining unused parameters (exclude frozen ones)
+    other_params = [
+        p for p in model.parameters() if id(p) not in used_params and id(p) not in frozen_params
+    ]
+    if other_params:
+        param_groups.append({"params": other_params, "lr": base_lr, "name": "base"})
+
+    return param_groups
+
+
+import torch.distributed as dist
+
+
+def only_main_process(func):
+    """Decorator: run ``func`` only on the main process (rank 0); other ranks get ``None``."""
+    import functools  # local import keeps this block self-contained
+
+    @functools.wraps(func)  # preserve the wrapped function's name and docstring
+    def wrapper(*args, **kwargs):
+        if dist.is_initialized() and dist.get_rank() != 0:
+            return None  # non-main process does not execute
+        return func(*args, **kwargs)
+
+    return wrapper
+
+
+from PIL import Image
+from torchvision.ops import box_iou
+
+
+def resize_images(images, target_size=(224, 224)):
+    """
+    Resize a PIL image, or every PIL image inside arbitrarily nested lists.
+
+    :param images: a single ``PIL.Image.Image`` or a (nested) list of them.
+    :param target_size: size tuple forwarded to ``Image.resize`` as (width, height).
+    :return: the resized image(s), mirroring the nesting of the input.
+    :raises ValueError: if a value is neither a PIL image nor a list.
+    """
+    if isinstance(images, Image.Image):  # leaf: a single PIL image
+        return images.resize(target_size)
+    if isinstance(images, list):  # container: rebuild it with resized members
+        return [resize_images(member, target_size) for member in images]
+    raise ValueError("Unsupported image type or structure.")
+
+
+class TrainerUtils:
+    @staticmethod
+    def freeze_backbones(model, freeze_modules=""):
+        """
+        Freeze the submodules named by a comma-separated list of relative module paths.
+
+        Each path is resolved attribute by attribute starting from ``model``; for example
+        "qwen_vl_interface, action_model.net" freezes ``model.qwen_vl_interface`` and
+        ``model.action_model.net``. Paths that do not resolve are reported and skipped.
+
+        Args:
+            model: nn.Module model object
+            freeze_modules: comma-separated relative module paths; an empty or
+                non-string value freezes nothing
+
+        Returns:
+            model: the same model object, with ``requires_grad`` set to False on every
+                parameter of each resolved module
+        """
+        frozen = []
+        print("#" * 30)
+        print(freeze_modules)
+        if freeze_modules and isinstance(freeze_modules, str):
+            # split and remove whitespace
+            patterns = [p.strip() for p in freeze_modules.split(",") if p.strip()]
+
+            for path in patterns:
+                # split the "relative path" by dots, for example "action_model.net" → ["action_model", "net"]
+                attrs = path.split(".")
+                module = model
+                try:
+                    for attr in attrs:
+                        module = getattr(module, attr)
+                    # if the module is successfully get, freeze it and its all submodule parameters
+                    for param in module.parameters():
+                        param.requires_grad = False
+                    frozen.append(path)
+                except AttributeError:
+                    # if the attribute does not exist, skip and print warning
+                    print(f"⚠️ module path does not exist, cannot freeze: {path}")
+                    continue
+
+        # Print the summary on the main process only. ``dist.get_rank`` has to be called:
+        # comparing the function object itself with 0 is always False and would hide the
+        # summary on every rank. The rank is only queried when a process group exists,
+        # because ``get_rank()`` raises otherwise; single-process runs simply print.
+        if not dist.is_initialized() or dist.get_rank() == 0:
+            print(f"🔒 Frozen modules with re pattern: {frozen}")
+        return model
+
+    @staticmethod
+    def print_trainable_parameters(model):
+        """
+        Print the total and trainable parameter counts of the model (main process only).
+        :param model: PyTorch model instance
+        :return: ``(num_params, num_trainable_params)`` on the main process, else ``None``
+        """
+        # guard with is_initialized(): get_rank() raises without a default process group
+        if dist.is_initialized() and dist.get_rank() != 0:
+            return None
+        print("📊 model parameter statistics:")
+        num_params = sum(p.numel() for p in model.parameters())
+        num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print(f"# Parameters (in millions): {num_params / 10**6:.3f} Total, {num_trainable_params / 10**6:.3f} Trainable")
+        return num_params, num_trainable_params
+
+    @staticmethod
+    def load_pretrained_backbones(model, checkpoint_path=None, reload_modules=None):
+        """
+        Load parameters from a ``torch.load``-able checkpoint (assumed to be a flat state dict) into ``model``.
+        - if reload_modules (comma-separated module paths) is set, only keys under each "<path>." prefix are loaded (strict)
+        - otherwise → load the entire model parameters with strict=False (overwrite model)
+
+        return:
+            ``model`` after loading; NOTE(review): ``[]`` is returned when checkpoint_path is empty, and ``loaded_modules`` is only tracked locally — confirm intended
+        """
+        if not checkpoint_path:
+            return []
+        if dist.get_rank() == 0:
+            print(f"📦 loading checkpoint: {checkpoint_path}")
+        try:
+            checkpoint = torch.load(checkpoint_path, map_location="cpu")  # NOTE(review): unpickles the file — only load trusted checkpoints
+        except Exception as e:
+            raise RuntimeError(f"❌ loading checkpoint failed: {e}")
+
+        loaded_modules = []
+
+        if reload_modules:  # partial load
+            module_paths = [p.strip() for p in reload_modules.split(",") if p.strip()]
+            for path in module_paths:
+                reload_modules = path.split(".")  # rebinds the argument name; module_paths was already computed above
+                module = model
+                try:
+                    for module_name in reload_modules:  # find the module to modify level by level
+                        module = getattr(module, module_name)
+                    prefix = path + "."
+                    sub_state_dict = {
+                        k[len(prefix) :]: v for k, v in checkpoint.items() if k.startswith(prefix)
+                    }
+                    if sub_state_dict:
+                        module.load_state_dict(sub_state_dict, strict=True)
+                        if dist.get_rank() == 0:
+                            print(f"✅ parameters loaded to module '{path}'")
+                        loaded_modules.append(path)
+                    else:
+                        print(f"⚠️ parameters not found in checkpoint '{path}'")
+                except AttributeError:
+                    print(f"❌ cannot find module path: {path}")
+        else:  # full load
+            try:
+                model.load_state_dict(checkpoint, strict=False)
+                if dist.get_rank() == 0:
+                    print("✅ loaded <full_model> model parameters")
+                loaded_modules = ["<full_model>"]
+            except Exception as e:
+                raise RuntimeError(f"❌ loading full model failed: {e}")
+        return model
+
+    @staticmethod
+    def print_freeze_status(model):
+        """
+        Print one line per named parameter stating whether it is frozen or trainable.
+        :param model: PyTorch model instance
+        """
+        for param_name, tensor in model.named_parameters():
+            label = "Trainable" if tensor.requires_grad else "Frozen"
+            print(f"{param_name:60s}  |  {label}")
+
+    @staticmethod
+    def setup_distributed_training(accelerator, *components):
+        """
+        Hand the given training components to ``accelerator.prepare``.
+
+        :param accelerator: object exposing ``prepare(*components)`` (an Accelerate instance)
+        :param components: any number of components (such as model, optimizer, dataloader, etc.)
+        :return: whatever ``accelerator.prepare`` returns for those components
+        """
+        # thin pass-through: the accelerator does all of the wrapping
+        wrap = accelerator.prepare
+        return wrap(*components)
+
+    @staticmethod
+    def euclidean_distance(predicted: np.ndarray, ground_truth: np.ndarray) -> float:
+        return np.linalg.norm(predicted - ground_truth)  # norm of the element-wise difference (numpy default ord)
+
+    @staticmethod
+    def _reset_dataloader(dataloader, epoch_counter):
+        """
+        Begin a new pass over ``dataloader``.
+
+        Increments the epoch counter, forwards it to ``dataloader.sampler.set_epoch`` when
+        that is callable, and returns ``(iter(dataloader), epoch_counter)``.
+        """
+        epoch_counter += 1
+        # a loader without a sampler, or a sampler without set_epoch, is left untouched
+        set_epoch = getattr(getattr(dataloader, "sampler", None), "set_epoch", None)
+        if callable(set_epoch):
+            set_epoch(epoch_counter)
+        return iter(dataloader), epoch_counter
+
+    @staticmethod
+    def compute_grad_angle_with_stats(
+        grads_a: list[torch.Tensor], grads_v: list[torch.Tensor]
+    ) -> tuple[float, float]:
+        """
+        compute the per-row angle (degrees) between the leading slices of grads_a[0] and grads_v[0], then summarize the angles.
+        grads_a, grads_v: gradient Tensor lists; only the first tensor of each list is used
+        return:
+            mean_angle_deg: average angle (degrees)
+            angle_variance: square root of torch.var of the angles (a standard deviation, despite the name)
+        """
+        angle_degs = []
+
+        # Only the first tensor of each list is compared, and only its leading [:32, :7]
+        # slice: one angle is computed per row (up to 32 rows) over 7 columns.
+        # The original author notes cosine similarity "fails" in high-dimensional space,
+        # hence the small slice. NOTE(review): assumes grads_a[0] / grads_v[0] are at least
+        # 2-D (author's shape note: [2048, 11008]) — confirm against the caller.
+
+        # Earlier experiment kept for reference (conv-style gradient of shape 1280, 3, 14, 14):
+        # grads_a[0][0].view(-1, 3) vs grads_v[0][0].view(-1, 3), i.e. reshape to [196, 3]
+        # layer
+        grads_action = grads_a[0]  # [2048, 11008]
+        grads_action = grads_action[
+            :32, :7
+        ]  # keep the first 32 rows and first 7 columns, avoid cosim failure in high-dimensional space
+        grads_vl = grads_v[0]  # [2048, 11008]
+        grads_vl = grads_vl[
+            :32, :7
+        ]  # keep the first 32 rows and first 7 columns, avoid cosim failure in high-dimensional space
+        for g_a, g_v in zip(grads_action, grads_vl):
+            dot = torch.sum(g_a * g_v)
+            norm_a_sq = torch.sum(g_a * g_a)
+            norm_v_sq = torch.sum(g_v * g_v)
+
+            # avoid division by zero
+            norm_a = torch.sqrt(norm_a_sq + 1e-16)
+            norm_v = torch.sqrt(norm_v_sq + 1e-16)
+
+            cos_sim = (dot / (norm_a * norm_v)).clamp(-1.0, 1.0)
+            angle_rad = torch.acos(cos_sim)
+            angle_deg = angle_rad * (180.0 / torch.pi)
+
+            angle_degs.append(angle_deg.item())
+
+        # mean of the per-row angles, and the square root of their torch.var
+        angle_degs_tensor = torch.tensor(angle_degs)
+        mean_angle_deg = torch.mean(angle_degs_tensor).item()
+        angle_variance = torch.sqrt(torch.var(angle_degs_tensor)).item()
+        # accelerator.wait_for_everyone()
+        return mean_angle_deg, angle_variance
+
+    @staticmethod
+    def pcgrad_project(
+        grads_a: list[torch.Tensor], grads_v: list[torch.Tensor]
+    ) -> list[torch.Tensor]:
+        """
+        PCGrad projection of ``grads_v`` against ``grads_a``.
+
+        The dot product and ``||grads_a||^2`` are accumulated over all tensor pairs. When the
+        dot product is negative, every ``g_v`` becomes ``g_v - (dot / (||grads_a||^2 + 1e-6)) * g_a``;
+        otherwise the input list ``grads_v`` is returned unchanged.
+        """
+        inner, sq_norm_a = 0.0, 0.0
+        for ref, cand in zip(grads_a, grads_v):
+            inner += torch.sum(ref * cand)
+            sq_norm_a += torch.sum(ref * ref)
+
+        if not inner < 0:
+            return grads_v  # no conflict between the two gradient groups
+
+        # conflicting directions: remove the component of grads_v along grads_a
+        scale = inner / (sq_norm_a + 1e-6)
+        return [cand - scale * ref for ref, cand in zip(grads_a, grads_v)]
+
+    @staticmethod
+    def eval_qwenpi(qwenpi, dataloader, num_batches=20):
+        """
+        Evaluate a model exposing ``predict_action_withCoT``: per-sample pick/place box IoU and a per-batch action distance.
+
+        Args:
+            qwenpi: model instance; only ``predict_action_withCoT`` is called on it.
+            dataloader: iterable of batches; each batch is iterated as dicts with "image", "lang", "action", "solution".
+            num_batches: maximum number of batches to evaluate (stops early when the loader is exhausted).
+
+        Returns:
+            dict: {"iou_scores": list of {"pick_iou", "place_iou"}, "average_action_distance": mean of per-batch distances}.
+        """
+        iou_scores = []
+        action_distances = []
+        count = 0
+
+        dataset_iter = iter(dataloader)
+        while count < num_batches:
+            try:
+                batch_samples = next(dataset_iter)
+                count += 1
+            except StopIteration:
+                break
+
+            # extract data
+            images = [example["image"] for example in batch_samples]
+            instructions = [example["lang"] for example in batch_samples]
+            actions = [example["action"] for example in batch_samples]
+            solutions = [example["solution"] for example in batch_samples]
+
+            # model prediction
+            predicted_solutions, normalized_actions = qwenpi.predict_action_withCoT(
+                images=images, instructions=instructions, use_ddim=False, num_ddim_steps=20
+            )
+
+            # extract and convert predicted results (each entry is a parsed dict, or None when parsing failed)
+            parsed_solutions = []
+            for solution in predicted_solutions:
+                parsed_solution = TrainerUtils.extract_json_from_string(solution)
+                parsed_solutions.append(parsed_solution)
+
+            # compute IoU (NOTE(review): pred_dict is None when JSON extraction failed — subscripting it would raise TypeError)
+            for pred_dict, gt_dict in zip(parsed_solutions, solutions):
+                pred_pick_bbox = torch.tensor(
+                    pred_dict["pick"]["bbox_2d"], dtype=torch.float32
+                ).unsqueeze(0)
+                gt_pick_bbox = torch.tensor(
+                    gt_dict["pick"]["bbox_2d"], dtype=torch.float32
+                ).unsqueeze(0)
+                pred_place_bbox = torch.tensor(
+                    pred_dict["place"]["bbox_2d"], dtype=torch.float32
+                ).unsqueeze(0)
+                gt_place_bbox = torch.tensor(
+                    gt_dict["place"]["bbox_2d"], dtype=torch.float32
+                ).unsqueeze(0)
+
+                pick_iou = box_iou(pred_pick_bbox, gt_pick_bbox).item()
+                place_iou = box_iou(pred_place_bbox, gt_place_bbox).item()
+
+                iou_scores.append({"pick_iou": pick_iou, "place_iou": place_iou})
+
+            # compute action distance: norm of (prediction - target) divided by the number of target elements
+            actions = np.array(actions)  # convert to numpy array
+            num_pots = np.prod(actions.shape)  # B*len*dim
+            action_distance = TrainerUtils.euclidean_distance(normalized_actions, actions)
+            average_action_distance = action_distance / num_pots
+            action_distances.append(average_action_distance)
+
+        # summarize results (np.mean of an empty list yields nan when no batch was evaluated)
+        avg_action_distance = np.mean(action_distances)
+        return {"iou_scores": iou_scores, "average_action_distance": avg_action_distance}
+
+    @staticmethod
+    def extract_json_from_string(input_string):
+        """
+        Parse the JSON object embedded in a string.
+
+        The span from the first "{" to the last "}" is handed to ``json.loads``.
+
+        Args:
+            input_string (str): text that contains a JSON object plus extra characters.
+
+        Returns:
+            The decoded JSON, or None (after printing a message) if nothing parseable is found.
+        """
+        found = re.search(r"{.*}", input_string, re.DOTALL)
+        if found is None:
+            print("No valid JSON part found")
+            return None
+        try:
+            return json.loads(found.group(0))
+        except json.JSONDecodeError as e:
+            print(f"JSON decode failed: {e}")
+            return None
+
+    def _get_latest_checkpoint(self, checkpoint_dir):
+        """Find the latest checkpoint in the directory based on step number; returns (path, step) or (None, 0)."""
+        if not os.path.exists(checkpoint_dir):
+            self.accelerator.print(f"No checkpoint directory found at {checkpoint_dir}")
+            return None, 0
+
+        # collect every entry that follows the naming rule, matching only names that end in .pt
+        checkpoints = [
+            f
+            for f in os.listdir(checkpoint_dir)
+            if re.match(r"steps_(\d+)_pytorch_model\.pt$", f)  # the trailing $ ensures the name ends with .pt
+            and os.path.isfile(os.path.join(checkpoint_dir, f))  # make sure it is a file
+        ]
+
+        if not checkpoints:
+            self.accelerator.print(f"No checkpoints found in {checkpoint_dir}")
+            return None, 0
+
+        # extract the step number from each file name
+        try:
+            checkpoints_with_steps = [
+                (ckpt, int(re.search(r"steps_(\d+)_pytorch_model\.pt", ckpt).group(1)))
+                for ckpt in checkpoints
+            ]
+        except AttributeError as e:
+            self.accelerator.print(f"Error parsing checkpoint filenames: {e}")
+            return None, 0
+
+        # sort by step number and take the latest checkpoint
+        checkpoints_with_steps.sort(key=lambda x: x[1])
+        latest_checkpoint, completed_steps = checkpoints_with_steps[-1]
+
+        latest_checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
+        self.accelerator.print(f"Latest checkpoint found: {latest_checkpoint_path}")
+        return latest_checkpoint_path, completed_steps
+
+
+import os
+
+
+def is_main_process():
+    """Return True when the ``RANK`` environment variable is unset or equal to 0."""
+    return int(os.environ.get("RANK", 0)) == 0
diff --git a/tests/unit_tests/inference/test_qwen3_vl_apply_chat_template.py b/tests/unit_tests/inference/test_qwen3_vl_apply_chat_template.py
new file mode 100644
index 0000000000..d54ea20237
--- /dev/null
+++ b/tests/unit_tests/inference/test_qwen3_vl_apply_chat_template.py
@@ -0,0 +1,67 @@
+import os
+
+import pytest
+import torch
+
+from flagscale.models.vlm.qwen3_vl import DEFAULT_IMAGE_TOKEN
+from flagscale.train.utils.image_tools import to_pil_preserve
+
+
+def _load_processor():
+    pytest.importorskip("transformers")  # skip the test when transformers is not installed
+    from transformers import AutoProcessor
+    # the model id can be overridden through the QWEN3_VL_TEST_MODEL environment variable
+    model_id = os.environ.get("QWEN3_VL_TEST_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
+    try:
+        return AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    except Exception as exc:  # any loading failure skips the test instead of failing it
+        pytest.skip(f"Unable to load processor for {model_id}: {exc}")
+
+
+def test_apply_chat_template_batched_images_match_per_sample_messages():
+    processor = _load_processor()
+    batch_size = 2
+    num_images = 2
+    height = 32
+    width = 32
+    images = torch.rand(batch_size, num_images, 3, height, width)
+    pil_images = [
+        [to_pil_preserve(img.permute(1, 2, 0).numpy()) for img in sample] for sample in images
+    ]
+    # Path 1: one user message per sample, with the images given as structured content entries.
+    instruction = "Describe."
+    per_sample_messages = []
+    for sample_images in pil_images:
+        content = [{"type": "image", "image": img} for img in sample_images]
+        content.append({"type": "text", "text": instruction})
+        per_sample_messages.append({"role": "user", "content": content})
+
+    rendered_from_messages = processor.apply_chat_template(
+        per_sample_messages,  # NOTE(review): passed as one flat list of user messages — confirm this is the intended batching
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    # Path 2: the same prompt written as plain text, with one image placeholder token per image.
+    prompt = f"{DEFAULT_IMAGE_TOKEN}\n" * num_images + instruction
+    batched_messages = [
+        {"role": "user", "content": [{"type": "text", "text": prompt}]}
+    ] * batch_size
+    rendered_from_prefix = processor.apply_chat_template(
+        batched_messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    # Tokenize both renderings (flat PIL list vs. flattened tensor batch) and require identical input_ids.
+    list_inputs = processor(
+        text=rendered_from_messages,
+        images=[img for sample in pil_images for img in sample],
+        padding=True,
+        return_tensors="pt",
+    )
+    batched_inputs = processor(
+        text=rendered_from_prefix,
+        images=images.view(-1, 3, height, width),
+        padding=True,
+        return_tensors="pt",
+    )
+    assert torch.equal(list_inputs["input_ids"], batched_inputs["input_ids"])
diff --git a/tests/unit_tests/models/vla/qwen_gr00t_ref.py b/tests/unit_tests/models/vla/qwen_gr00t_ref.py
new file mode 100644
index 0000000000..7aa818ca7b
--- /dev/null
+++ b/tests/unit_tests/models/vla/qwen_gr00t_ref.py
@@ -0,0 +1,206 @@
+# Mainly adopted from:
+# https://github.com/starVLA/starVLA/blob/3f7feefbc5fc25890ad3a7d262b8a0aea1339aa7/starVLA/model/framework/QwenGR00T.py
+# Below is the original copyright:
+
+# Copyright 2025 starVLA community. All rights reserved.
+# Licensed under the MIT License, Version 1.0 (the "License");
+# Implemented by [Junqiu YU / Fudan University] in [2025].
+# Design and Merged by [Jinhui YE / HKUST University] in [2025].
+
+"""
+Qwen-GR00T Framework
+A lightweight implementation that Qwen-VL + Flow-matching head to directly predict continuous actions
+Flow-matching header is copyright from GR00T N1.5,
+"""
+
+import numpy as np
+import torch
+from transformers import PretrainedConfig, PreTrainedModel
+
+from flagscale.models.action_model.gr00t_action_header import FlowmatchingActionHead
+from flagscale.models.utils.constants import ACTION, OBS_STATE
+
+# from flagscale.models.vlm.qwen2_5_vl import _QWen_VL_Interface
+from flagscale.models.vlm.qwen3_vl import _QWen3_VL_Interface
+from flagscale.train.utils.image_tools import to_pil_preserve
+from flagscale.train.utils.trainer_tools import resize_images
+
+
+class QwenGR00T(PreTrainedModel):
+    """
+    Multimodal vision-language-action model.
+
+    Components:
+      - Qwen2.5 VL interface for fused language/vision token embeddings
+      - Layer-wise QFormer for multi-layer feature aggregation
+      - DINO encoder for dense multi-view spatial tokens
+      - DiT diffusion head for future action sequence modeling
+
+    Focus: Predict future continuous actions conditioned on images + instruction.
+    """
+
+    config_class = PretrainedConfig
+
+    def __init__(
+        self,
+        config: dict | None = None,
+        **kwargs,
+    ) -> None:
+        """
+        Construct all submodules and cache key configuration values.
+
+        Args:
+            config: Hierarchical configuration read by attribute access (e.g. ``config.model.action_model``); None is not usable.
+            **kwargs: Reserved for future overrides (unused).
+        """
+        super().__init__(PretrainedConfig())
+        self.config = config
+        # self.qwen_vl_interface = _QWen_VL_Interface(config=self.config)
+        self.qwen_vl_interface = _QWen3_VL_Interface(config=self.config)
+        # align dims --> we should put them to config or no?
+        self.config.model.action_model.diffusion_model_cfg.cross_attention_dim = (
+            self.qwen_vl_interface.model.config.hidden_size
+        )
+
+        self.action_model: FlowmatchingActionHead = FlowmatchingActionHead(
+            full_config=self.config
+        )  # fix later references
+
+        self.future_action_window_size = config.model.action_model.future_action_window_size
+        self.past_action_window_size = config.model.action_model.past_action_window_size
+        self.chunk_len = self.past_action_window_size + 1 + self.future_action_window_size
+
+    def forward(
+        self,
+        examples: list[dict] | None = None,
+        **kwargs,
+    ) -> tuple:
+        """Encode ``examples`` with the Qwen-VL interface and return the action head's output (named ``action_loss``)."""
+        # FIXME: state is None
+        # from torchvision import transforms
+        # image_transform = transforms.ToPILImage()
+
+        # batch_images = [example["image"] for example in examples]  #  [B, [PLT]]
+        # instructions = [example["lang"] for example in examples]  # [B, str]
+        # actions = [example["action"] for example in examples]  # label [B, len, 7]
+
+        actions = examples[ACTION]
+        state = examples[OBS_STATE]
+
+        # state = (
+        #     [example["state"] for example in examples] if "state" in examples[0] else None
+        # )  # [B, 1, state_dim]
+
+        # Step 1: QWenVL input format
+        qwen_inputs = self.qwen_vl_interface.build_qwenvl_inputs(
+            examples=examples,
+            image_keys=self.config.data.vla_data.image_features,
+            # images=batch_images, instructions=instructions
+        )
+
+        # print(f"qwen_inputs: {qwen_inputs}")
+
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            qwenvl_outputs = self.qwen_vl_interface(
+                **qwen_inputs,
+                output_attentions=False,
+                output_hidden_states=True,
+                return_dict=True,
+            )
+            # last_hidden_state: [B, seq_len, H]
+            last_hidden = qwenvl_outputs.hidden_states[-1]  # [B, L, H]
+
+        # Step 4: Action Expert Forward and Loss
+        with torch.autocast("cuda", dtype=torch.float32):
+            # TODO: (yupu) Is this a bug or a feature? The action dtype would stay as bf16 under this autocast.
+            if isinstance(actions, torch.Tensor):
+                actions = actions.to(device=last_hidden.device, dtype=last_hidden.dtype)
+            else:
+                actions = torch.tensor(
+                    np.array(actions), device=last_hidden.device, dtype=last_hidden.dtype
+                )
+            # TODO: does not match RoboBrainX, need to check
+            # actions = torch.tensor(
+            #     np.array(actions), device=last_hidden.device, dtype=last_hidden.dtype
+            # )  # [B, T_full, action_dim]
+            actions_target = actions[
+                :, -(self.future_action_window_size + 1) :, :
+            ]  # (B, chunk_len, action_dim)
+
+            # TODO: (yupu) I believe there is a bug in starVLA, the
+            # `repeated_diffusion_steps` is not properly set in the config.
+            repeated_diffusion_steps = self.config.model.action_model.get(
+                "repeated_diffusion_steps", 4
+            )
+
+            actions_target_repeated = actions_target.repeat(repeated_diffusion_steps, 1, 1)
+            last_hidden_repeated = last_hidden.repeat(repeated_diffusion_steps, 1, 1)
+
+            state_repeated = None
+            if state is not None:
+                state = state.to(device=last_hidden.device, dtype=last_hidden.dtype)
+                state_repeated = state.repeat(repeated_diffusion_steps, 1, 1)
+
+            action_loss = self.action_model(
+                last_hidden_repeated, actions_target_repeated, state_repeated
+            )  # (B, chunk_len, action_dim)
+
+        return action_loss
+
+    @torch.inference_mode()
+    def predict_action(
+        self,
+        examples: list[dict],
+        **kwargs: str,
+    ) -> dict:
+        """
+        Steps:
+          1. Resize images to training resolution (if specified)
+          2. Encode with QwenVL (hidden states retained)
+          3. Run the action head on the last hidden state and return the result as numpy
+        Returns:
+            dict:
+                normalized_actions (np.ndarray): Shape [B, T, action_dim], diffusion-sampled normalized actions.
+        """
+        if type(examples) is not list:
+            examples = [examples]
+        batch_images = [to_pil_preserve(example["image"]) for example in examples]  # [B, [PLT]]
+        instructions = [example["lang"] for example in examples]  # [B, str]
+
+        state = (
+            [example["state"] for example in examples] if "state" in examples[0] else None
+        )  # [B, 1, state_dim]
+
+        train_obs_image_size = getattr(self.config.data.vla_data, "image_size", None)
+        if train_obs_image_size:
+            batch_images = resize_images(batch_images, target_size=train_obs_image_size)
+
+        # Step 1: QWenVL input format (NOTE(review): forward() calls this helper with examples=/image_keys= instead — confirm both call forms are supported)
+        qwen_inputs = self.qwen_vl_interface.build_qwenvl_inputs(
+            images=batch_images, instructions=instructions
+        )
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            qwenvl_outputs = self.qwen_vl_interface(
+                **qwen_inputs,
+                output_attentions=False,
+                output_hidden_states=True,
+                return_dict=True,
+            )
+
+            # last_hidden_state: [B, seq_len, H]
+            last_hidden = qwenvl_outputs.hidden_states[-1]  # [B, L, H]
+
+        state = (
+            torch.from_numpy(np.array(state)).to(last_hidden.device, dtype=last_hidden.dtype)
+            if state is not None
+            else None
+        )
+
+        # Step 4: Action Expert Forward
+        with torch.autocast("cuda", dtype=torch.float32):
+            pred_actions = self.action_model.predict_action(
+                last_hidden, state
+            )  # (B, chunk_len, action_dim)
+
+        normalized_actions = pred_actions.detach().cpu().numpy()
+        return {"normalized_actions": normalized_actions}
diff --git a/tests/unit_tests/models/vla/test_protocols.py b/tests/unit_tests/models/vla/test_protocols.py
new file mode 100644
index 0000000000..edcd3a51cd
--- /dev/null
+++ b/tests/unit_tests/models/vla/test_protocols.py
@@ -0,0 +1,57 @@
+import unittest
+
+import torch
+
+
+class MockVLM:
+    """Stand-in VLM backbone: static config, pass-through inputs, random hidden states."""
+
+    config = property(lambda self: {"hidden_size": 1024})
+
+    def prepare_input(self, batch):
+        return batch
+
+    def forward(self, batch, **kwargs):
+        return dict(hidden_states=(torch.randn(1, 10, 1024),))
+
+
+class MockActionModel:  # stand-in action head: fixed loss, random actions
+    def forward(self, vlm_output, action_input, **kwargs):
+        return dict(loss=torch.tensor(0.5))
+
+    def predict(self, vlm_output, action_input, **kwargs):
+        return dict(actions=torch.randn(1, 16, 7))
+
+
+class TestVLMBackboneProtocol(unittest.TestCase):
+    def test_mock_vlm_has_protocol_methods(self):
+        """MockVLM exposes config/prepare_input/forward and returns hidden states."""
+        backbone = MockVLM()
+        for member in ("config", "prepare_input", "forward"):
+            self.assertTrue(hasattr(backbone, member))
+
+        result = backbone.forward({})
+        self.assertIn("hidden_states", result)
+
+
+class TestActionModelProtocol(unittest.TestCase):
+    def test_mock_action_model_has_protocol_methods(self):
+        head = MockActionModel()
+        for member in ("forward", "predict"):
+            self.assertTrue(hasattr(head, member))
+
+    def test_forward_returns_loss(self):
+        """forward() on a VLM output plus action targets yields a ``loss`` entry."""
+        hidden = (torch.randn(1, 10, 1024),)
+        targets = torch.randn(1, 16, 7)
+
+        result = MockActionModel().forward({"hidden_states": hidden}, {"actions": targets})
+        self.assertIn("loss", result)
+
+    def test_predict_returns_actions(self):
+        """predict() with an empty action input yields an ``actions`` entry."""
+        hidden = (torch.randn(1, 10, 1024),)
+        head = MockActionModel()
+
+        result = head.predict({"hidden_states": hidden}, {})
+        self.assertIn("actions", result)
diff --git a/tests/unit_tests/models/vla/test_qwen_gr00t_parity.py b/tests/unit_tests/models/vla/test_qwen_gr00t_parity.py
new file mode 100644
index 0000000000..0ba0bfddde
--- /dev/null
+++ b/tests/unit_tests/models/vla/test_qwen_gr00t_parity.py
@@ -0,0 +1,153 @@
+import unittest
+
+import torch
+from omegaconf import OmegaConf
+
+from flagscale.models.utils.constants import ACTION, OBS_STATE
+
+
+class TestQwenGR00TParity(unittest.TestCase):
+    """
+    End-to-end loss parity test between the reference QwenGR00T and QwenGr00t.
+
+    Note: the test is skipped when CUDA is unavailable. The config it builds
+    points at local Qwen3-VL weights under /workspace/models.
+    """
+
+    @unittest.skipIf(not torch.cuda.is_available(), "No GPU available")
+    def test_forward_parity(self):
+        """Both models must return the same loss for one batch under the same seed."""
+        from tests.unit_tests.models.vla.qwen_gr00t_ref import QwenGR00T
+
+        from flagscale.models.vla.qwen_gr00t import QwenGr00t
+
+        # One config object is shared by both models
+        config = self._create_test_config()
+
+        # Build both models from it and move them to the GPU
+        model_v1 = QwenGR00T(config=config).cuda()
+        model_v2 = QwenGr00t(config=config).cuda()
+
+        # Load v1's action-model weights into v2's wrapped head (`_head`)
+        model_v2.action_model._head.load_state_dict(model_v1.action_model.state_dict())
+
+        # One synthetic batch, reused for both forward passes
+        batch = self._create_test_batch()
+
+        # Re-seed before each forward so both passes start from the same RNG state
+        torch.manual_seed(42)
+        loss_v1 = model_v1.forward(batch)
+
+        torch.manual_seed(42)
+        loss_v2 = model_v2.forward(batch)
+
+        # Losses must agree within atol=1e-5 (plus allclose's default rtol)
+        self.assertTrue(
+            torch.allclose(loss_v1, loss_v2, atol=1e-5),
+            f"Loss mismatch: v1={loss_v1.item()}, v2={loss_v2.item()}",
+        )
+
+    def _create_test_config(self):
+        """Create config matching examples/qwen_gr00t/conf/train/qwen_gr00t.yaml."""
+        config_dict = {
+            "model": {
+                "model_name": "qwen_gr00t",
+                "checkpoint_dir": "/workspace/models/Qwen/Qwen3-VL-4B-Instruct/",
+                "vlm": {
+                    "type": "qwen3-vl",
+                },
+                "qwenvl": {
+                    "base_vlm": "/workspace/models/Qwen/Qwen3-VL-4B-Instruct/",
+                    "attn_implementation": "flash_attention_2",
+                    "vl_hidden_dim": 2048,
+                },
+                "action_model": {
+                    "type": "flow_matching",
+                    "action_model_type": "DiT-B",
+                    "action_hidden_dim": 1024,
+                    "hidden_size": 1024,
+                    "add_pos_embed": True,
+                    "max_seq_len": 1024,
+                    "action_dim": 7,
+                    "state_dim": 8,
+                    "future_action_window_size": 7,
+                    "action_horizon": 8,
+                    "past_action_window_size": 0,
+                    "repeated_diffusion_steps": 4,
+                    "noise_beta_alpha": 1.5,
+                    "noise_beta_beta": 1.0,
+                    "noise_s": 0.999,
+                    "num_timestep_buckets": 1000,
+                    "num_inference_timesteps": 4,
+                    "num_target_vision_tokens": 32,
+                    "diffusion_model_cfg": {
+                        "cross_attention_dim": 2048,
+                        "dropout": 0.2,
+                        "final_dropout": True,
+                        "interleave_self_attention": True,
+                        "norm_type": "ada_norm",
+                        "num_layers": 16,
+                        "output_dim": 1024,
+                        "positional_embeddings": None,
+                    },
+                },
+                "reduce_in_full_precision": True,
+            },
+            "data": {
+                "data_path": "",
+                "vla_data": {
+                    "image_features": [
+                        "observation.images.image",
+                        "observation.images.wrist_image",
+                    ],
+                },
+            },
+            "system": {
+                "batch_size": 16,
+                "train_steps": 80000,
+                "log_freq": 10,
+                "grad_clip_norm": 1.0,
+                "optimizer": {"name": "AdamW", "lr": 2.5e-5},
+                "scheduler": {"warmup_steps": 5000},
+                "checkpoint": {
+                    "save_checkpoint": False,
+                    "save_freq": 1000,
+                    "output_directory": "/tmp",
+                },
+            },
+        }
+        return OmegaConf.create(config_dict)
+
+    def _create_test_batch(self):
+        """
+        Create a random (torch.randn) test batch in the training data format.
+
+        Batch structure, as stated by the original author (batch size 16):
+        - action: [16, 8, 7] float32
+        - task: list of 16 strings
+        - observation.images.wrist_image: [16, 3, 224, 224] float32
+        - observation.images.image: [16, 3, 224, 224] float32
+        - observation.state: [16, 1, 8] float32
+        """
+        batch_size = 16
+        action_horizon = 8
+        action_dim = 7
+        state_dim = 8
+        img_channels = 3
+        img_size = 224
+
+        return {
+            ACTION: torch.randn(batch_size, action_horizon, action_dim, dtype=torch.float32),
+            "task": ["put the bowl on the plate"] * batch_size,
+            "observation.images.image": torch.randn(
+                batch_size, img_channels, img_size, img_size, dtype=torch.float32
+            ),
+            "observation.images.wrist_image": torch.randn(
+                batch_size, img_channels, img_size, img_size, dtype=torch.float32
+            ),
+            OBS_STATE: torch.randn(batch_size, 1, state_dim, dtype=torch.float32),
+        }
+
+
+if __name__ == "__main__":  # allow running this test module directly
+    unittest.main()
diff --git a/tests/unit_tests/models/vla/test_registry.py b/tests/unit_tests/models/vla/test_registry.py
new file mode 100644
index 0000000000..4a4a91baea
--- /dev/null
+++ b/tests/unit_tests/models/vla/test_registry.py
@@ -0,0 +1,41 @@
+import unittest
+
+from flagscale.models.vla.registry import (
+    ACTION_MODEL_REGISTRY,
+    VLM_REGISTRY,
+    build_action_model,
+    build_vlm,
+    register_action_model,
+    register_vlm,
+)
+
+
+class TestRegistry(unittest.TestCase):
+    def test_register_vlm(self):
+        # A decorated class must be listed in VLM_REGISTRY and buildable by name.
+        @register_vlm("test-vlm")
+        class TestVLM:
+            def __init__(self, **kwargs):
+                self.kwargs = kwargs
+
+        self.assertIn("test-vlm", VLM_REGISTRY)
+        self.assertEqual(build_vlm("test-vlm", model_id="test").kwargs["model_id"], "test")
+
+    def test_register_action_model(self):
+        # Same contract for action models: listed in the registry and buildable.
+        @register_action_model("test-model")
+        class TestModel:
+            def __init__(self, vlm_config, action_config):
+                self.vlm_config, self.action_config = vlm_config, action_config
+
+        self.assertIn("test-model", ACTION_MODEL_REGISTRY)
+        built = build_action_model("test-model", vlm_config={}, action_config={"action_dim": 7})
+        self.assertEqual(built.action_config["action_dim"], 7)
+
+    def test_build_unknown_vlm_raises(self):
+        """Building an unregistered VLM name raises ValueError."""
+        self.assertRaises(ValueError, build_vlm, "nonexistent-vlm-xyz")
+
+    def test_build_unknown_action_model_raises(self):
+        empty = {"vlm_config": {}, "action_config": {}}
+        self.assertRaises(ValueError, build_action_model, "nonexistent-model-xyz", **empty)
diff --git a/tests/unit_tests/models/vla/test_utils.py b/tests/unit_tests/models/vla/test_utils.py
new file mode 100644
index 0000000000..0ff384d481
--- /dev/null
+++ b/tests/unit_tests/models/vla/test_utils.py
@@ -0,0 +1,34 @@
+import unittest
+
+from flagscale.models.vla.utils import get_vlm_config
+
+
+class MockConfigDirect:
+    """Config stub exposing hidden_size and num_hidden_layers as top-level attributes."""
+    hidden_size, num_hidden_layers = 2048, 28
+
+
+class MockConfigNested:
+    """Config stub that keeps both sizes on a nested ``text_config`` attribute."""
+
+    text_config = type("text_config", (), {"hidden_size": 1536, "num_hidden_layers": 24})
+
+
+class MockConfigInvalid:
+    """Config stub that defines neither ``hidden_size`` nor ``text_config``."""
+
+
+class TestGetVlmConfig(unittest.TestCase):
+    def test_direct_config(self):
+        """Sizes stored directly on the config object are returned."""
+        sizes = get_vlm_config(MockConfigDirect())
+        self.assertEqual((sizes["hidden_size"], sizes["num_hidden_layers"]), (2048, 28))
+
+    def test_nested_config(self):
+        """Sizes stored on the nested ``text_config`` are returned."""
+        sizes = get_vlm_config(MockConfigNested())
+        self.assertEqual((sizes["hidden_size"], sizes["num_hidden_layers"]), (1536, 24))
+
+    def test_invalid_config_raises(self):
+        """A config exposing neither layout is rejected with ValueError."""
+        self.assertRaises(ValueError, get_vlm_config, MockConfigInvalid())
diff --git a/tests/unit_tests/models/vla/vlm/test_qwen_processor_parity.py b/tests/unit_tests/models/vla/vlm/test_qwen_processor_parity.py
new file mode 100644
index 0000000000..f5ce295e83
--- /dev/null
+++ b/tests/unit_tests/models/vla/vlm/test_qwen_processor_parity.py
@@ -0,0 +1,150 @@
+"""End-to-end parity test: PIL pipeline vs tensor pipeline for Qwen processor.
+
+Verifies that building processor inputs via:
+  Path A: tensor -> PIL -> messages -> apply_chat_template(tokenize=True)
+  Path B: tensor -> processor(text=..., images=..., do_rescale=False)
+produces identical input_ids, attention_mask, and pixel_values.
+
+Usage:
+    pytest test_qwen_processor_parity.py \
+        --model-id /path/to/Qwen3-VL \
+        --batch-path /path/to/batch.pt
+"""
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image
+from transformers import AutoProcessor
+
+from flagscale.models.vla.vlm.qwen_vl import build_processor_inputs
+
+IMAGE_KEYS = ["observation.images.image", "observation.images.wrist_image"]  # batch image keys
+
+
+# def pytest_addoption(parser):
+#     parser.addoption("--model-id", required=True, help="Path to Qwen VL model")
+#     parser.addoption("--batch-path", required=True, help="Path to saved batch .pt file")
+
+
+@pytest.fixture(scope="session")
+def processor(request):
+    """Load the processor named by --model-id; skip if the option is undeclared or unset."""
+    proc = AutoProcessor.from_pretrained(request.config.getoption("--model-id", skip=True))
+    proc.tokenizer.padding_side = "left"
+    return proc
+
+
+@pytest.fixture(scope="session")
+def batch(request):
+    """Load the batch saved at --batch-path (trusted pickle); skip if the option is absent."""
+    return torch.load(request.config.getoption("--batch-path", skip=True), weights_only=False)
+
+
+# ── Pipeline implementations ────────────────────────────────────────────
+
+
+def to_pil_preserve(arr: np.ndarray) -> Image.Image:
+    """Clamp *arr* to [0, 1], scale it to uint8 (adding 0.5 before truncation), wrap as RGB."""
+    quantized = (np.clip(arr, 0.0, 1.0) * 255.0 + 0.5).astype(np.uint8)
+    return Image.fromarray(quantized, mode="RGB")
+
+
+def _tensor_to_pil_list(batch_tensor: torch.Tensor) -> list:
+    """Convert an image tensor into a list of PIL images, one per leading-axis item.
+
+    Non-tensor input is returned unchanged. A 3-D tensor is treated as a single
+    image and wrapped into a batch of one before conversion.
+    """
+    if not isinstance(batch_tensor, torch.Tensor):
+        return batch_tensor
+
+    frames = batch_tensor.unsqueeze(0) if batch_tensor.ndim == 3 else batch_tensor
+    # Items whose last axis has 1, 3 or 4 entries are used as-is; others are permuted (1, 2, 0).
+    oriented = (f if f.shape[-1] in (1, 3, 4) else f.permute(1, 2, 0) for f in frames)
+    return [to_pil_preserve(f.detach().cpu().numpy()) for f in oriented]
+
+
+def run_path_a(processor, batch: dict) -> dict:
+    """Path A: tensor -> PIL -> chat messages -> apply_chat_template(tokenize=True)."""
+    prompts = batch["task"]
+    if isinstance(prompts, torch.Tensor):
+        prompts = prompts.detach().cpu().tolist()
+    if isinstance(prompts, str):
+        prompts = [prompts]
+
+    per_sample = None
+    for image_key in IMAGE_KEYS:
+        pil_images = _tensor_to_pil_list(batch[image_key])
+        if per_sample is not None:
+            for bucket, image in zip(per_sample, pil_images):
+                bucket.append(image)
+        else:
+            per_sample = [[image] for image in pil_images]
+
+    conversations = []
+    for images, prompt in zip(per_sample, prompts):
+        parts = [{"type": "image", "image": image} for image in images]
+        parts += [{"type": "text", "text": prompt}]
+        conversations.append([{"role": "user", "content": parts}])
+
+    return processor.apply_chat_template(
+        conversations,
+        tokenize=True,
+        padding=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+
+def run_path_b(processor, batch: dict) -> dict:
+    """Production path: delegates to build_processor_inputs (flagscale.models.vla.vlm.qwen_vl)."""
+    return build_processor_inputs(processor, batch, IMAGE_KEYS)
+
+
+# ── Tests ────────────────────────────────────────────────────────────────
+
+
+@pytest.fixture(scope="session")
+def results(processor, batch):
+    """Run both pipelines once per session and return ``(path_a, path_b)`` outputs."""
+    outputs = run_path_a(processor, batch), run_path_b(processor, batch)
+    return outputs
+
+
+def test_output_keys_match(results):
+    """Both pipelines must emit exactly the same set of output keys."""
+    assert set(results[0].keys()) == set(results[1].keys())
+
+
+@pytest.mark.parametrize("key", ["input_ids", "attention_mask"])
+def test_integer_tensor_exact_match(results, key):
+    """The tensor stored under *key* must be identical in both outputs."""
+    if key not in results[0]:
+        pytest.skip(f"'{key}' not in output")
+    lhs, rhs = (out[key].cpu() for out in results)
+    assert lhs.shape == rhs.shape, f"shape mismatch: {list(lhs.shape)} vs {list(rhs.shape)}"
+    assert torch.equal(lhs, rhs), f"value mismatch: {(lhs != rhs).sum().item()} elements differ"
+
+
+def test_pixel_values_close(results):
+    """Pixel values of both paths must match in shape and agree within atol=0.1."""
+    # Path A quantizes to uint8 and back to float, so only approximate equality is required.
+    if "pixel_values" not in results[0]:
+        pytest.skip("'pixel_values' not in output")
+    lhs, rhs = (out["pixel_values"].cpu().float() for out in results)
+    assert lhs.shape == rhs.shape, f"shape mismatch: {list(lhs.shape)} vs {list(rhs.shape)}"
+    gap = (lhs - rhs).abs()
+    within_tolerance = torch.allclose(lhs, rhs, atol=0.1)
+    assert within_tolerance, f"max diff={gap.max():.6f}, mean diff={gap.mean():.6f}"
+
+
+def test_image_grid_thw_match(results):
+    """The ``image_grid_thw`` tensors of both paths must be identical."""
+    if "image_grid_thw" not in results[0]:
+        pytest.skip("'image_grid_thw' not in output")
+    lhs, rhs = (out["image_grid_thw"].cpu() for out in results)
+    # Compare shapes first so a mismatch is reported with both shapes.
+    assert lhs.shape == rhs.shape, f"shape mismatch: {list(lhs.shape)} vs {list(rhs.shape)}"
+    assert torch.equal(lhs, rhs)
diff --git a/tests/unit_tests/models/vla/vlm/test_qwen_vl.py b/tests/unit_tests/models/vla/vlm/test_qwen_vl.py
new file mode 100644
index 0000000000..13c56e7187
--- /dev/null
+++ b/tests/unit_tests/models/vla/vlm/test_qwen_vl.py
@@ -0,0 +1,29 @@
+import unittest
+
+from flagscale.models.vla.registry import VLM_REGISTRY
+
+
+class TestQwenVLRegistration(unittest.TestCase):
+    def test_qwen25_vl_registered(self):
+        """After importing qwen_vl, "qwen2.5-vl" is present in VLM_REGISTRY."""
+        from flagscale.models.vla.vlm import qwen_vl  # noqa: F401
+        self.assertIn("qwen2.5-vl", VLM_REGISTRY)
+
+    def test_qwen3_vl_registered(self):
+        """After importing qwen_vl, "qwen3-vl" is present in VLM_REGISTRY."""
+        from flagscale.models.vla.vlm import qwen_vl  # noqa: F401
+        self.assertIn("qwen3-vl", VLM_REGISTRY)
+
+    def test_qwen25_has_required_methods(self):
+        """Qwen25VLBackbone defines model_config, prepare_input and forward."""
+        from flagscale.models.vla.vlm.qwen_vl import Qwen25VLBackbone as backbone_cls
+
+        for member in ("model_config", "prepare_input", "forward"):
+            self.assertTrue(hasattr(backbone_cls, member))
+
+    def test_qwen3_has_required_methods(self):
+        """Qwen3VLBackbone defines model_config, prepare_input and forward."""
+        from flagscale.models.vla.vlm.qwen_vl import Qwen3VLBackbone as backbone_cls
+
+        for member in ("model_config", "prepare_input", "forward"):
+            self.assertTrue(hasattr(backbone_cls, member))
diff --git a/tests/unit_tests/models/vla/vlm/test_vlm_init.py b/tests/unit_tests/models/vla/vlm/test_vlm_init.py
new file mode 100644
index 0000000000..61087f33f0
--- /dev/null
+++ b/tests/unit_tests/models/vla/vlm/test_vlm_init.py
@@ -0,0 +1,9 @@
+import unittest
+
+
+class TestVLMInit(unittest.TestCase):
+    def test_imports(self):
+        """Both Qwen backbones are importable from the ``vlm`` package."""
+        from flagscale.models.vla.vlm import Qwen3VLBackbone, Qwen25VLBackbone
+        for exported in (Qwen25VLBackbone, Qwen3VLBackbone):
+            self.assertIsNotNone(exported)
diff --git a/tests/unit_tests/train/utils/test_optim_setup.py b/tests/unit_tests/train/utils/test_optim_setup.py
new file mode 100644
index 0000000000..b624416f5a
--- /dev/null
+++ b/tests/unit_tests/train/utils/test_optim_setup.py
@@ -0,0 +1,737 @@
+"""Unit tests for optimizer setup utilities."""
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+import torch
+import torch.nn as nn
+
+from flagscale.train.utils.optim_setup import (
+    apply_freeze_config,
+    build_optim_param_groups,
+    freeze_and_get_trainable_params,
+    log_trainable_params,
+    print_param_names,
+    setup_optimizer_and_scheduler,
+    setup_scheduler,
+)
+
+
+class SimpleModel(nn.Module):
+    """Encoder -> decoder -> head MLP used for testing freeze patterns."""
+
+    def __init__(self):
+        super().__init__()
+        # Submodules are instantiated in encoder, decoder, head order.
+        self.encoder = self._bottleneck()
+        self.decoder = self._bottleneck()
+        self.head = nn.Linear(10, 5)
+
+    @staticmethod
+    def _bottleneck():
+        """Build a 10 -> 20 -> 10 Linear/ReLU/Linear stack."""
+        return nn.Sequential(
+            nn.Linear(10, 20),
+            nn.ReLU(),
+            nn.Linear(20, 10),
+        )
+
+    def forward(self, x):
+        return self.head(self.decoder(self.encoder(x)))
+
+
+class NestedModel(nn.Module):
+    """Identity model with nested ``vlm`` / ``action_model`` parts, similar to QwenGR00T."""
+
+    def __init__(self):
+        super().__init__()
+        # Layers are created in registration order: visual, language, then action model.
+        visual = nn.Sequential(
+            nn.Linear(10, 20),
+            nn.Linear(20, 10),
+        )
+        language = nn.ModuleDict(
+            {
+                "layers": nn.ModuleList([nn.Linear(10, 10) for _ in range(5)]),
+                "embed": nn.Embedding(100, 10),
+            }
+        )
+        self.vlm = nn.ModuleDict({"visual": visual, "language": language})
+
+        # Flat container holding two projections and a list of four blocks.
+        action_parts = {
+            "encoder": nn.Linear(10, 20),
+            "decoder": nn.Linear(20, 10),
+            "transformer_blocks": nn.ModuleList([nn.Linear(10, 10) for _ in range(4)]),
+        }
+        self.action_model = nn.ModuleDict(action_parts)
+
+    def forward(self, x):
+        """Return the input unchanged."""
+        return x
+
+
+class TestFreezeAndGetTrainableParams(unittest.TestCase):
+    """Behaviour of freeze_and_get_trainable_params on SimpleModel."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    def test_no_patterns_all_trainable(self):
+        """No freeze/keep patterns: every parameter is returned and stays trainable."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=None,
+                keep_patterns=None,
+            )
+        )
+
+        expected_count = len(list(self.model.parameters()))
+        self.assertEqual(len(trainable), expected_count)
+
+        for tensor in self.model.parameters():
+            self.assertTrue(tensor.requires_grad)
+
+    def test_freeze_single_module(self):
+        """A single pattern freezes the parameters of the matching module only."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["encoder\\..*"],
+                keep_patterns=None,
+            )
+        )
+
+        # Encoder parameters are frozen; everything else keeps requires_grad
+        for pname, tensor in self.model.named_parameters():
+            if not pname.startswith("encoder"):
+                self.assertTrue(tensor.requires_grad, f"{pname} should be trainable")
+            else:
+                self.assertFalse(tensor.requires_grad, f"{pname} should be frozen")
+
+        # Nothing frozen may appear in the returned list
+        for tensor in trainable:
+            self.assertTrue(tensor.requires_grad)
+
+    def test_freeze_multiple_modules(self):
+        """Several freeze patterns can be applied in one call."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["encoder\\..*", "decoder\\..*"],
+                keep_patterns=None,
+            )
+        )
+
+        # With encoder and decoder frozen, only the head remains trainable
+        for pname, tensor in self.model.named_parameters():
+            if not pname.startswith("head"):
+                self.assertFalse(tensor.requires_grad)
+            else:
+                self.assertTrue(tensor.requires_grad)
+
+        # The returned list has as many entries as the head has parameters
+        expected_count = sum(
+            pname.startswith("head") for pname, _ in self.model.named_parameters()
+        )
+        self.assertEqual(len(trainable), expected_count)
+
+    def test_freeze_all_pattern(self):
+        """The catch-all '.*' pattern freezes every parameter."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=[".*"],
+                keep_patterns=None,
+            )
+        )
+
+        self.assertEqual(len(trainable), 0)
+        for tensor in self.model.parameters():
+            self.assertFalse(tensor.requires_grad)
+
+    def test_keep_patterns_override_freeze(self):
+        """keep_patterns take precedence over freeze_patterns."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=[".*"],  # everything is a freeze candidate
+                keep_patterns=["head\\..*"],  # except the head, which is kept
+            )
+        )
+
+        # The head is the single module left trainable
+        for pname, tensor in self.model.named_parameters():
+            if not pname.startswith("head"):
+                self.assertFalse(tensor.requires_grad, f"{pname} should be frozen")
+            else:
+                self.assertTrue(tensor.requires_grad, f"{pname} should be trainable")
+
+        # head.weight and head.bias are the two returned parameters
+        self.assertEqual(len(trainable), 2)
+
+    def test_partial_pattern_match(self):
+        """Patterns are matched with search semantics (a substring is enough)."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["weight"],  # hits every *.weight parameter
+                keep_patterns=None,
+            )
+        )
+
+        # Weights are frozen, so only the biases stay trainable
+        for pname, tensor in self.model.named_parameters():
+            if "weight" not in pname:
+                self.assertTrue(tensor.requires_grad)
+            else:
+                self.assertFalse(tensor.requires_grad)
+
+        # The returned list has one entry per non-weight parameter
+        expected_count = sum(
+            "weight" not in pname for pname, _ in self.model.named_parameters()
+        )
+        self.assertEqual(len(trainable), expected_count)
+
+
+class TestFreezeWithNestedModel(unittest.TestCase):
+    """Freeze patterns applied to the nested parameter names of NestedModel."""
+
+    def setUp(self):
+        self.model = NestedModel()
+
+    def test_freeze_vlm_module(self):
+        """Freezing 'vlm\\..*' leaves only the action model trainable."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["vlm\\..*"],
+                keep_patterns=None,
+            )
+        )
+
+        for pname, tensor in self.model.named_parameters():
+            if not pname.startswith("vlm"):
+                self.assertTrue(tensor.requires_grad, f"{pname} should be trainable")
+            else:
+                self.assertFalse(tensor.requires_grad, f"{pname} should be frozen")
+
+        # The returned list has one entry per action_model parameter
+        expected_count = sum(
+            pname.startswith("action_model") for pname, _ in self.model.named_parameters()
+        )
+        self.assertEqual(len(trainable), expected_count)
+
+    def test_freeze_specific_layers(self):
+        """A character class in the pattern freezes a range of layer indices."""
+        # The pattern below targets language layers 0, 1 and 2
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["vlm\\.language\\.layers\\.[0-2]\\..*"],
+                keep_patterns=None,
+            )
+        )
+
+        # Layers 0-2 match the pattern and must be frozen
+        frozen_tags = tuple(f"vlm.language.layers.{idx}" for idx in range(3))
+        for pname, tensor in self.model.named_parameters():
+            if any(tag in pname for tag in frozen_tags):
+                self.assertFalse(tensor.requires_grad, f"{pname} should be frozen")
+
+        # Layers 3 and 4 fall outside the [0-2] character class,
+        # so they must still be trainable
+        kept_tags = tuple(f"vlm.language.layers.{idx}" for idx in (3, 4))
+        for pname, tensor in self.model.named_parameters():
+            if any(tag in pname for tag in kept_tags):
+                self.assertTrue(tensor.requires_grad, f"{pname} should be trainable")
+
+        # The returned list has one entry per parameter that still requires grad
+        expected_count = sum(
+            tensor.requires_grad for tensor in self.model.parameters()
+        )
+        self.assertEqual(len(trainable), expected_count)
+
+    def test_freeze_vlm_keep_visual(self):
+        """A keep pattern can exempt the visual encoder from a VLM-wide freeze."""
+        trainable = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["vlm\\..*"],
+                keep_patterns=["vlm\\.visual\\..*"],
+            )
+        )
+
+        for pname, tensor in self.model.named_parameters():
+            if pname.startswith("vlm.visual"):
+                self.assertTrue(tensor.requires_grad, f"{pname} should be trainable")
+            elif pname.startswith("vlm"):
+                self.assertFalse(tensor.requires_grad, f"{pname} should be frozen")
+
+        # The returned list has one entry per parameter that still requires grad
+        expected_count = sum(
+            tensor.requires_grad for tensor in self.model.parameters()
+        )
+        self.assertEqual(len(trainable), expected_count)
+
+
+class TestApplyFreezeConfig(unittest.TestCase):
+    """Behaviour of apply_freeze_config on SimpleModel."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    def test_none_config_returns_all_params(self):
+        """Passing None as the config returns every parameter."""
+        trainable = apply_freeze_config(self.model, None)
+
+        expected_count = len(list(self.model.parameters()))
+        self.assertEqual(len(trainable), expected_count)
+
+    def test_with_freeze_config(self):
+        """A config object carrying freeze_patterns/keep_patterns is honoured."""
+        # MagicMock stands in for a FreezeConfig-like object
+        rules = MagicMock()
+        rules.configure_mock(freeze_patterns=["encoder\\..*"], keep_patterns=None)
+
+        trainable = apply_freeze_config(self.model, rules)
+
+        # Every parameter except the encoder's must be returned
+        frozen_count = sum(
+            pname.startswith("encoder") for pname, _ in self.model.named_parameters()
+        )
+        overall_count = len(list(self.model.parameters()))
+
+        self.assertEqual(len(trainable), overall_count - frozen_count)
+
+
+class TestLogTrainableParams(unittest.TestCase):
+    """Structure of the summary returned by log_trainable_params."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    def test_all_trainable(self):
+        """With nothing frozen, all three submodules are listed as trainable."""
+        summary = log_trainable_params(self.model)
+
+        self.assertIn("trainable", summary)
+        self.assertIn("frozen", summary)
+        # Each of the three submodules must show up under "trainable"
+        for module_name in ("encoder", "decoder", "head"):
+            self.assertIn(module_name, summary["trainable"])
+
+    def test_partial_frozen(self):
+        """A frozen encoder is reported under "frozen" with a value above zero."""
+        # Turn off gradients for every encoder parameter
+        # (the only parameter names that start with "encoder")
+        for tensor in self.model.encoder.parameters():
+            tensor.requires_grad = False
+
+        summary = log_trainable_params(self.model)
+
+        self.assertIn("encoder", summary["frozen"])
+        self.assertIn("decoder", summary["trainable"])
+        self.assertIn("head", summary["trainable"])
+        self.assertGreater(summary["frozen"]["encoder"], 0)
+
+
+class TestUnusedPatternWarnings(unittest.TestCase):
+    """Patterns that match no parameter must produce a logger warning."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    @patch("flagscale.train.utils.optim_setup.logger")
+    def test_warns_on_unused_freeze_pattern(self, mock_logger):
+        """A freeze pattern without any match is reported via logger.warning."""
+        _ = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["nonexistent_module\\..*"],
+                keep_patterns=None,
+            )
+        )
+
+        mock_logger.warning.assert_called()
+        message = mock_logger.warning.call_args.args[0]
+        self.assertIn("Freeze patterns matched nothing", message)
+
+    @patch("flagscale.train.utils.optim_setup.logger")
+    def test_warns_on_unused_keep_pattern(self, mock_logger):
+        """A keep pattern without any match is reported via logger.warning."""
+        _ = list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["encoder\\..*"],
+                keep_patterns=["nonexistent_module\\..*"],
+            )
+        )
+
+        mock_logger.warning.assert_called()
+        message = mock_logger.warning.call_args.args[0]
+        self.assertIn("Keep patterns matched nothing", message)
+
+
+class TestPrintParamNames(unittest.TestCase):
+    """Output of the print_param_names debugging helper."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    @patch("builtins.print")
+    def test_prints_all_params(self, mock_print):
+        """Without a pattern, print is called at least once."""
+        print_param_names(self.model)
+
+        self.assertTrue(mock_print.called)
+
+    @patch("builtins.print")
+    def test_filters_by_pattern(self, mock_print):
+        """With a pattern, every printed line mentions that pattern."""
+        print_param_names(self.model, pattern="encoder")
+
+        # The first positional argument of each print call must contain "encoder"
+        for printed in mock_print.call_args_list:
+            self.assertIn("encoder", printed.args[0])
+
+
+class TestParameterCounts(unittest.TestCase):
+    """Test that parameter counts are correctly reported."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    @patch("flagscale.train.utils.optim_setup.logger")
+    def test_parameter_count_logging(self, mock_logger):
+        """Verify correct parameter counts are logged."""
+        # Count total params
+        total_params = sum(p.numel() for p in self.model.parameters())
+
+        # Count encoder params
+        encoder_params = sum(
+            p.numel() for name, p in self.model.named_parameters() if name.startswith("encoder")
+        )
+
+        # Freeze encoder
+        list(
+            freeze_and_get_trainable_params(
+                self.model.named_parameters(),
+                freeze_patterns=["encoder\\..*"],
+                keep_patterns=None,
+            )
+        )
+
+        # Check that info was logged with correct counts
+        mock_logger.info.assert_called()
+        info_call = mock_logger.info.call_args[0][0]
+        self.assertIn(f"trainable={total_params - encoder_params:,}", info_call)
+        self.assertIn(f"frozen={encoder_params:,}", info_call)
+
+
+class TestBuildOptimParamGroups(unittest.TestCase):
+    """Test build_optim_param_groups function (NeMo-style per-module config)."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    def test_none_config_returns_single_group(self):
+        """With None config, should return single group with all params."""
+        param_groups = build_optim_param_groups(self.model, None)
+
+        self.assertEqual(len(param_groups), 1)
+        all_params = list(self.model.parameters())
+        self.assertEqual(len(param_groups[0]["params"]), len(all_params))
+
+    def test_single_module_config(self):
+        """Test with config for single module."""
+        config = {"encoder": {"lr": 1e-5}}
+        param_groups = build_optim_param_groups(self.model, config)
+
+        # Should have 2 groups: default + encoder
+        self.assertEqual(len(param_groups), 2)
+
+        # Find encoder group
+        encoder_group = next(g for g in param_groups if g.get("name") == "encoder")
+        self.assertEqual(encoder_group["lr"], 1e-5)
+
+        # Encoder params count
+        encoder_param_count = sum(
+            1 for name, _ in self.model.named_parameters() if name.startswith("encoder")
+        )
+        self.assertEqual(len(encoder_group["params"]), encoder_param_count)
+
+    def test_multiple_module_config(self):
+        """Test with config for multiple modules."""
+        config = {
+            "encoder": {"lr": 1e-5, "weight_decay": 0.01},
+            "decoder": {"lr": 2e-5},
+        }
+        param_groups = build_optim_param_groups(self.model, config)
+
+        # Should have 3 groups: default + encoder + decoder
+        self.assertEqual(len(param_groups), 3)
+
+        encoder_group = next(g for g in param_groups if g.get("name") == "encoder")
+        decoder_group = next(g for g in param_groups if g.get("name") == "decoder")
+
+        self.assertEqual(encoder_group["lr"], 1e-5)
+        self.assertEqual(encoder_group["weight_decay"], 0.01)
+        self.assertEqual(decoder_group["lr"], 2e-5)
+
+    def test_default_group_contains_remaining_params(self):
+        """Default group should contain params not in other groups."""
+        config = {"encoder": {"lr": 1e-5}}
+        param_groups = build_optim_param_groups(self.model, config)
+
+        default_group = next(g for g in param_groups if g.get("name") == "default")
+
+        # Default should contain decoder + head params
+        non_encoder_count = sum(
+            1 for name, _ in self.model.named_parameters() if not name.startswith("encoder")
+        )
+        self.assertEqual(len(default_group["params"]), non_encoder_count)
+
+    def test_respects_requires_grad(self):
+        """Should only include trainable params."""
+        # Freeze encoder
+        for name, param in self.model.named_parameters():
+            if name.startswith("encoder"):
+                param.requires_grad = False
+
+        config = {"encoder": {"lr": 1e-5}}
+        param_groups = build_optim_param_groups(self.model, config)
+
+        # Encoder group should be empty (no trainable params)
+        encoder_groups = [g for g in param_groups if g.get("name") == "encoder"]
+        # Either no encoder group, or encoder group has no params
+        if encoder_groups:
+            self.assertEqual(len(encoder_groups[0]["params"]), 0)
+
+    @patch("flagscale.train.utils.optim_setup.logger")
+    def test_warns_on_nonexistent_module(self, mock_logger):
+        """Should warn when module doesn't exist."""
+        config = {"nonexistent": {"lr": 1e-5}}
+        build_optim_param_groups(self.model, config)
+
+        mock_logger.warning.assert_called()
+        warning_call = mock_logger.warning.call_args[0][0]
+        self.assertIn("nonexistent", warning_call)
+
+
+class TestBuildOptimParamGroupsNested(unittest.TestCase):
+    """Test build_optim_param_groups with nested model structure."""
+
+    def setUp(self):
+        self.model = NestedModel()
+
+    def test_nested_module_path(self):
+        """Test accessing nested modules via dot path."""
+        config = {"vlm.visual": {"lr": 1e-5}}
+        param_groups = build_optim_param_groups(self.model, config)
+
+        visual_group = next(g for g in param_groups if g.get("name") == "vlm.visual")
+        self.assertEqual(visual_group["lr"], 1e-5)
+
+        # Count visual params
+        visual_param_count = sum(
+            1 for name, _ in self.model.named_parameters() if name.startswith("vlm.visual")
+        )
+        self.assertEqual(len(visual_group["params"]), visual_param_count)
+
+    def test_multiple_nested_paths(self):
+        """Test multiple nested module configs."""
+        config = {
+            "vlm.visual": {"lr": 1e-5},
+            "vlm.language": {"lr": 2e-5},
+            "action_model": {"lr": 1e-4},
+        }
+        param_groups = build_optim_param_groups(self.model, config)
+
+        # 3 configured groups + default (though default may be empty)
+        groups_with_params = [g for g in param_groups if len(g["params"]) > 0]
+        self.assertGreaterEqual(len(groups_with_params), 3)
+
+
+class TestSetupScheduler(unittest.TestCase):
+    """Test setup_scheduler function."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
+
+    def test_cosine_scheduler(self):
+        """Test creating a cosine scheduler."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = "cosine"
+        scheduler_config.warmup_steps = 100
+        scheduler_config.scheduler_kwargs = None
+
+        scheduler = setup_scheduler(self.optimizer, scheduler_config, num_training_steps=1000)
+
+        self.assertIsNotNone(scheduler)
+        self.assertTrue(hasattr(scheduler, "step"))
+
+    def test_linear_scheduler(self):
+        """Test creating a linear scheduler."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = "linear"
+        scheduler_config.warmup_steps = 50
+        scheduler_config.scheduler_kwargs = None
+
+        scheduler = setup_scheduler(self.optimizer, scheduler_config, num_training_steps=500)
+
+        self.assertIsNotNone(scheduler)
+
+    def test_constant_with_warmup_scheduler(self):
+        """Test creating a constant_with_warmup scheduler."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = "constant_with_warmup"
+        scheduler_config.warmup_steps = 100
+        scheduler_config.scheduler_kwargs = None
+
+        scheduler = setup_scheduler(self.optimizer, scheduler_config, num_training_steps=1000)
+
+        self.assertIsNotNone(scheduler)
+
+    def test_cosine_with_min_lr(self):
+        """Test creating a cosine scheduler with min_lr."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = "cosine_with_min_lr"
+        scheduler_config.warmup_steps = 100
+        scheduler_config.scheduler_kwargs = {"min_lr": 1e-6}
+
+        scheduler = setup_scheduler(self.optimizer, scheduler_config, num_training_steps=1000)
+
+        self.assertIsNotNone(scheduler)
+
+    def test_raises_error_when_name_is_none(self):
+        """Should raise ValueError when scheduler name is None."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = None
+        scheduler_config.warmup_steps = 100
+        scheduler_config.scheduler_kwargs = None
+
+        with self.assertRaises(ValueError) as context:
+            setup_scheduler(self.optimizer, scheduler_config, num_training_steps=1000)
+
+        self.assertIn("name must be specified", str(context.exception))
+
+    def test_scheduler_step_updates_lr(self):
+        """Test that scheduler step updates learning rate."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = "linear"
+        scheduler_config.warmup_steps = 10
+        scheduler_config.scheduler_kwargs = None
+
+        scheduler = setup_scheduler(self.optimizer, scheduler_config, num_training_steps=100)
+
+        initial_lr = self.optimizer.param_groups[0]["lr"]
+        for _ in range(50):
+            scheduler.step()
+        final_lr = self.optimizer.param_groups[0]["lr"]
+
+        self.assertNotEqual(initial_lr, final_lr)
+
+    def test_warmup_phase(self):
+        """Test that warmup phase increases lr."""
+        scheduler_config = MagicMock()
+        scheduler_config.name = "linear"
+        scheduler_config.warmup_steps = 100
+        scheduler_config.scheduler_kwargs = None
+
+        scheduler = setup_scheduler(self.optimizer, scheduler_config, num_training_steps=1000)
+
+        lrs = []
+        for _ in range(50):
+            lrs.append(self.optimizer.param_groups[0]["lr"])
+            scheduler.step()
+
+        # During warmup, LR should generally increase
+        self.assertLess(lrs[0], lrs[-1])
+
+
+class TestSetupOptimizerAndScheduler(unittest.TestCase):
+    """Test setup_optimizer_and_scheduler function."""
+
+    def setUp(self):
+        self.model = SimpleModel()
+
+    def _make_train_config(self, freeze_patterns=None, keep_patterns=None):
+        """Helper to create a mock TrainConfig."""
+        train_config = MagicMock()
+        # System config
+        train_config.system = MagicMock()
+        train_config.system.optimizer = MagicMock()
+        train_config.system.optimizer.name = "AdamW"
+        train_config.system.optimizer.lr = 1e-4
+        train_config.system.optimizer.param_groups = None
+        train_config.system.optimizer.get_optimizer_kwargs.return_value = {"lr": 1e-4}
+        train_config.system.scheduler = MagicMock()
+        train_config.system.scheduler.name = "cosine"
+        train_config.system.scheduler.warmup_steps = 100
+        train_config.system.scheduler.scheduler_kwargs = None
+        train_config.system.train_steps = 1000
+        # Model config with freeze
+        train_config.model = MagicMock()
+        if freeze_patterns is not None:
+            train_config.model.freeze = MagicMock()
+            train_config.model.freeze.freeze_patterns = freeze_patterns
+            train_config.model.freeze.keep_patterns = keep_patterns
+        else:
+            train_config.model.freeze = None
+        return train_config
+
+    def test_returns_optimizer_and_scheduler(self):
+        """Test that function returns both optimizer and scheduler."""
+        train_config = self._make_train_config()
+
+        optimizer, scheduler = setup_optimizer_and_scheduler(self.model, train_config)
+
+        self.assertIsInstance(optimizer, torch.optim.AdamW)
+        self.assertIsNotNone(scheduler)
+        self.assertTrue(hasattr(scheduler, "step"))
+
+    def test_with_freeze_config(self):
+        """Test with freeze config applied."""
+        train_config = self._make_train_config(freeze_patterns=["encoder\\..*"])
+        train_config.system.scheduler.name = "linear"
+        train_config.system.scheduler.warmup_steps = 50
+        train_config.system.train_steps = 500
+
+        optimizer, scheduler = setup_optimizer_and_scheduler(self.model, train_config)
+
+        # Encoder should be frozen
+        for name, param in self.model.named_parameters():
+            if name.startswith("encoder"):
+                self.assertFalse(param.requires_grad)
+            else:
+                self.assertTrue(param.requires_grad)
+
+        self.assertIsInstance(optimizer, torch.optim.AdamW)
+        self.assertIsNotNone(scheduler)
+
+    def test_scheduler_uses_train_steps(self):
+        """Test that scheduler uses train_steps from TrainConfig."""
+        train_config = self._make_train_config()
+        train_config.system.scheduler.name = "linear"
+        train_config.system.scheduler.warmup_steps = 10
+        train_config.system.train_steps = 100
+
+        optimizer, scheduler = setup_optimizer_and_scheduler(self.model, train_config)
+
+        # Step through warmup first
+        for _ in range(15):
+            optimizer.step()
+            scheduler.step()
+        peak_lr = optimizer.param_groups[0]["lr"]
+
+        # Step through decay phase
+        for _ in range(80):
+            optimizer.step()
+            scheduler.step()
+        final_lr = optimizer.param_groups[0]["lr"]
+
+        # After decay, LR should be less than peak
+        self.assertLess(final_lr, peak_lr)