From 0d92a2b8894299ad621b61c09c00544ba65dcd4c Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:41 -0500 Subject: [PATCH 01/15] add ll distributed script across 4 nodes --- ep/bench/launch_distributed_ll.sh | 261 ++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100755 ep/bench/launch_distributed_ll.sh diff --git a/ep/bench/launch_distributed_ll.sh b/ep/bench/launch_distributed_ll.sh new file mode 100755 index 000000000..d29146634 --- /dev/null +++ b/ep/bench/launch_distributed_ll.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# +# Launch Low Latency Test Script (Distributed) +# +# Launches test_low_latency.py across multiple nodes via SSH +# +# Usage: +# ./launch_distributed.sh # Launch the test +# ./launch_distributed.sh kill # Kill lam conda env GPU processes only +# +# Environment variables: +# MASTER_PORT - Master port (default: random 29500-30499) +# CONDA_ENV - Conda environment to use (default: lam) +# NUM_TOKENS - Number of tokens (default: 128) +# HIDDEN - Hidden size (default: 7168) +# NUM_TOPK - Top-k value (default: 8) +# NUM_EXPERTS - Number of experts (default: 288) +# PER_RANK_LOG - Enable per-GPU/rank logging (default: 1, set to 0 to disable) +# +# Example: +# CONDA_ENV=lam NUM_TOKENS=256 ./launch_distributed.sh +# PER_RANK_LOG=0 ./launch_distributed.sh # disable per-GPU logs (node-level only) +# + +# Configuration +MASTER_ADDR="10.162.224.129" +MASTER_PORT="${MASTER_PORT:-$((29500 + RANDOM % 1000))}" # Random port 29500-30499 if not specified +NNODES=4 +NPROC_PER_NODE=8 + +# Node IPs (in order of node_rank) +NODES=( + "10.162.224.129" + "10.162.224.132" + "10.162.224.133" + "10.162.224.134" +) + +# Kill command - only kills processes running under the lam conda environment +# Two-stage: first shows PIDs, then asks for confirmation +if [ "$1" == "kill" ]; then + echo "Scanning for lam conda env GPU processes on all nodes..." 
+ LAM_PYTHON_PATH="/home/yangzhou/miniconda3/envs/lam/bin/python" + + # Declare associative array to store PIDs per node + declare -A NODE_PIDS + TOTAL_PIDS=0 + + # Stage 1: Collect PIDs from all nodes + for node_ip in "${NODES[@]}"; do + echo " Checking ${node_ip}..." + if [ "${node_ip}" == "${MASTER_ADDR}" ]; then + pids=$(sudo amd-smi process 2>/dev/null | grep -A1 "NAME: ${LAM_PYTHON_PATH}" | grep 'PID:' | awk '{print $2}' | sort -u | tr '\n' ' ') + else + pids=$(ssh -o StrictHostKeyChecking=no "${node_ip}" "sudo amd-smi process 2>/dev/null | grep -A1 'NAME: ${LAM_PYTHON_PATH}' | grep 'PID:' | awk '{print \$2}' | sort -u | tr '\n' ' '" 2>/dev/null) + fi + pids=$(echo "$pids" | xargs) # trim whitespace + if [ -n "$pids" ]; then + NODE_PIDS["$node_ip"]="$pids" + pid_count=$(echo "$pids" | wc -w) + TOTAL_PIDS=$((TOTAL_PIDS + pid_count)) + echo " Found PIDs: $pids" + else + echo " No lam processes found" + fi + done + + # Check if any PIDs found + if [ "$TOTAL_PIDS" -eq 0 ]; then + echo "" + echo "No lam processes found on any node." + exit 0 + fi + + # Stage 2: Ask for confirmation + echo "" + echo "==========================================" + echo "Summary: $TOTAL_PIDS process(es) to kill" + echo "==========================================" + for node_ip in "${!NODE_PIDS[@]}"; do + echo " ${node_ip}: ${NODE_PIDS[$node_ip]}" + done + echo "" + read -p "Kill these processes? [y/N]: " confirm + + if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then + echo "Aborted." + exit 0 + fi + + # Stage 3: Kill confirmed PIDs + echo "" + echo "Killing processes..." + for node_ip in "${!NODE_PIDS[@]}"; do + pids="${NODE_PIDS[$node_ip]}" + echo " Killing on ${node_ip}: $pids" + if [ "${node_ip}" == "${MASTER_ADDR}" ]; then + echo "$pids" | xargs -r kill -9 2>/dev/null + else + ssh -o StrictHostKeyChecking=no "${node_ip}" "echo '$pids' | xargs -r kill -9" 2>/dev/null + fi + done + echo "Done." 
+ exit 0 +fi + +# Conda configuration +CONDA_PATH="/home/yangzhou/miniconda3" +CONDA_ENV="${CONDA_ENV:-lam}" # Can override with CONDA_ENV=myenv + +# Default benchmark parameters +NUM_TOKENS="${NUM_TOKENS:-128}" +HIDDEN="${HIDDEN:-7168}" +NUM_TOPK="${NUM_TOPK:-8}" +NUM_EXPERTS="${NUM_EXPERTS:-288}" + +# Log directory (shared filesystem) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOG_DIR="${SCRIPT_DIR}/logs" +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") +RUN_LOG_DIR="${LOG_DIR}/${TIMESTAMP}" + +# Create log directory +mkdir -p "${RUN_LOG_DIR}" + +echo "==============================================" +echo "Launching distributed test_low_latency.py" +echo "==============================================" +echo "Master: ${MASTER_ADDR}:${MASTER_PORT}" +echo "Nodes: ${NNODES}, GPUs per node: ${NPROC_PER_NODE}" +echo "Conda env: ${CONDA_ENV}" +echo "Parameters: tokens=${NUM_TOKENS}, hidden=${HIDDEN}, topk=${NUM_TOPK}, experts=${NUM_EXPERTS}" +echo "Logs: ${RUN_LOG_DIR}" +if [ "${PER_RANK_LOG:-1}" == "1" ]; then + echo "Per-rank logging: ENABLED (logs in node_X/local_rank_Y/)" +else + echo "Per-rank logging: disabled" +fi +echo "==============================================" + +# Function to launch on a node +launch_node() { + local node_rank=$1 + local node_ip=${NODES[$node_rank]} + local log_file="${RUN_LOG_DIR}/node_${node_rank}_${node_ip}.log" + + echo "[$(date '+%H:%M:%S')] Launching on node ${node_rank} (${node_ip})..." 
+ + # Build the command using explicit Python path (more reliable than conda activate) + local PYTHON_BIN="${CONDA_PATH}/envs/${CONDA_ENV}/bin/python" + + # Per-rank logging: use torchrun's --log-dir and --tee options + local LOG_OPTS="" + if [ "${PER_RANK_LOG:-1}" == "1" ]; then + # Create per-rank log directory for this node + local RANK_LOG_DIR="${RUN_LOG_DIR}/node_${node_rank}" + mkdir -p "${RANK_LOG_DIR}" + # --tee 3 = tee both stdout(1) and stderr(2) to log files + # --log-dir = directory where per-rank logs will be written + LOG_OPTS="--log-dir ${RANK_LOG_DIR} --tee 3" + fi + + local torchrun_cmd="${PYTHON_BIN} -m torch.distributed.run \ + --nnodes=${NNODES} \ + --nproc_per_node=${NPROC_PER_NODE} \ + --node_rank=${node_rank} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + ${LOG_OPTS} \ + bench/test_low_latency.py \ + --num-tokens=${NUM_TOKENS} \ + --hidden=${HIDDEN} \ + --num-topk=${NUM_TOPK} \ + --single-run \ + --num-experts=${NUM_EXPERTS}" + + # Set required environment variables for network interfaces + local env_vars="export NCCL_SOCKET_IFNAME=enp49s0f1np1 && \ +export GLOO_SOCKET_IFNAME=enp49s0f1np1 && \ +export UCCL_IB_GID_INDEX=3 && \ +export NCCL_IB_GID_INDEX=3" + + local cmd="${env_vars} && cd ${SCRIPT_DIR}/.. 
&& ${torchrun_cmd}" + + # Write command to log file first + { + echo "==============================================" + echo "Command: ${torchrun_cmd}" + echo "Node: ${node_rank} (${node_ip})" + echo "Time: $(date)" + echo "Environment:" + echo " NCCL_SOCKET_IFNAME=enp49s0f1np1" + echo " GLOO_SOCKET_IFNAME=enp49s0f1np1" + echo " UCCL_IB_GID_INDEX=3" + echo " NCCL_IB_GID_INDEX=3" + echo "==============================================" + echo "" + } > "${log_file}" + + if [ "${node_ip}" == "${MASTER_ADDR}" ] && [ "${node_rank}" -eq 0 ]; then + # Local execution for master node + nohup bash -c "${cmd}" >> "${log_file}" 2>&1 & + else + # SSH to remote node + # Write output to shared filesystem log file on remote side + # Use nohup so process survives SSH disconnect + ssh -o StrictHostKeyChecking=no \ + -o ServerAliveInterval=30 \ + -o ServerAliveCountMax=10 \ + "${node_ip}" "nohup bash -c '${cmd}' >> ${log_file} 2>&1 &" & + fi + + echo "[$(date '+%H:%M:%S')] Node ${node_rank} launched, PID: $!, log: ${log_file}" +} + +# Launch all nodes +PIDS=() +for rank in $(seq 0 $((NNODES - 1))); do + launch_node ${rank} + PIDS+=($!) +done + +echo "" +echo "All nodes launched. PIDs: ${PIDS[*]}" +echo "Waiting for completion..." +echo "" + +# Wait for all processes and collect exit codes +EXIT_CODES=() +for i in "${!PIDS[@]}"; do + wait ${PIDS[$i]} + EXIT_CODES+=($?) 
+done + +# Print summary +echo "" +echo "==============================================" +echo "Execution completed" +echo "==============================================" +for i in "${!EXIT_CODES[@]}"; do + status="SUCCESS" + if [ ${EXIT_CODES[$i]} -ne 0 ]; then + status="FAILED (exit code: ${EXIT_CODES[$i]})" + fi + echo "Node ${i} (${NODES[$i]}): ${status}" +done + +# Create a combined log file +COMBINED_LOG="${RUN_LOG_DIR}/combined.log" +echo "Creating combined log: ${COMBINED_LOG}" +for rank in $(seq 0 $((NNODES - 1))); do + echo "" >> "${COMBINED_LOG}" + echo "========== Node ${rank} (${NODES[$rank]}) ==========" >> "${COMBINED_LOG}" + echo "" >> "${COMBINED_LOG}" + cat "${RUN_LOG_DIR}/node_${rank}_${NODES[$rank]}.log" >> "${COMBINED_LOG}" +done + +echo "" +echo "Logs saved to: ${RUN_LOG_DIR}" +echo " - Individual logs: node_X_.log" +echo " - Combined log: combined.log" From 07962317e5a32413b273c300807e5c9fafcbabc0 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:41 -0500 Subject: [PATCH 02/15] Add debug logging support --- ep/bench/buffer.py | 34 +++++++++++++++++++++++++++++++ ep/bench/launch_distributed_ll.sh | 5 ++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/ep/bench/buffer.py b/ep/bench/buffer.py index b49a528f9..e2e5c65a1 100644 --- a/ep/bench/buffer.py +++ b/ep/bench/buffer.py @@ -281,6 +281,24 @@ def low_latency_dispatch( hidden=x.shape[1], num_experts=num_experts, ) + + # DEBUG: Print data before CUDA kernel launch + if os.environ.get("DEBUG_DISPATCH", "0") == "1": + print(f"\n{'='*60}", flush=True) + print(f"[DEBUG] low_latency_dispatch - BEFORE CUDA kernel", flush=True) + print(f"{'='*60}", flush=True) + print(f" rank: {self.rank}", flush=True) + print(f" x.shape: {x.shape}, dtype: {x.dtype}", flush=True) + print(f" x.data_ptr: {hex(x.data_ptr())}", flush=True) + print(f" x[:3, :8]:\n{x[:3, :8]}", flush=True) + print(f" topk_idx.shape: {topk_idx.shape}, dtype: {topk_idx.dtype}", flush=True) + print(f" 
topk_idx[:5]:\n{topk_idx[:5]}", flush=True) + print(f" num_max_dispatch_tokens_per_rank: {num_max_dispatch_tokens_per_rank}", flush=True) + print(f" num_experts: {num_experts}", flush=True) + print(f" use_fp8: {use_fp8}, round_scale: {round_scale}, use_ue8m0: {use_ue8m0}", flush=True) + print(f" async_finish: {async_finish}, return_recv_hook: {return_recv_hook}", flush=True) + print(f"{'='*60}\n", flush=True) + ( packed_recv_x, packed_recv_x_scales, @@ -309,6 +327,22 @@ def low_latency_dispatch( x.size(1), num_experts, ) + + # DEBUG: Print data after CUDA kernel returns + if os.environ.get("DEBUG_DISPATCH", "0") == "1": + import torch + torch.cuda.synchronize() # Wait for kernel to complete + print(f"\n{'='*60}", flush=True) + print(f"[DEBUG] low_latency_dispatch - AFTER CUDA kernel", flush=True) + print(f"{'='*60}", flush=True) + print(f" packed_recv_x.shape: {packed_recv_x.shape}, dtype: {packed_recv_x.dtype}", flush=True) + print(f" packed_recv_count: {packed_recv_count}", flush=True) + print(f" packed_recv_src_info.shape: {packed_recv_src_info.shape}", flush=True) + print(f" packed_recv_layout_range.shape: {packed_recv_layout_range.shape}", flush=True) + if packed_recv_x_scales is not None: + print(f" packed_recv_x_scales.shape: {packed_recv_x_scales.shape}", flush=True) + print(f"{'='*60}\n", flush=True) + tensors_to_record = ( x, topk_idx, diff --git a/ep/bench/launch_distributed_ll.sh b/ep/bench/launch_distributed_ll.sh index d29146634..ce8804e2b 100755 --- a/ep/bench/launch_distributed_ll.sh +++ b/ep/bench/launch_distributed_ll.sh @@ -171,15 +171,14 @@ launch_node() { --num-tokens=${NUM_TOKENS} \ --hidden=${HIDDEN} \ --num-topk=${NUM_TOPK} \ - --single-run \ --num-experts=${NUM_EXPERTS}" # Set required environment variables for network interfaces local env_vars="export NCCL_SOCKET_IFNAME=enp49s0f1np1 && \ export GLOO_SOCKET_IFNAME=enp49s0f1np1 && \ export UCCL_IB_GID_INDEX=3 && \ -export NCCL_IB_GID_INDEX=3" - +export NCCL_IB_GID_INDEX=3 && \ +export 
DEBUG_DISPATCH=1" local cmd="${env_vars} && cd ${SCRIPT_DIR}/.. && ${torchrun_cmd}" # Write command to log file first From 4a15e8ca85485fba9e211adc47d1adcfb9c774b6 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:42 -0500 Subject: [PATCH 03/15] containerize env and launching script --- .devcontainer/devcontainer.json | 53 ++++++++++++++++++ .gitignore | 1 + ep/bench/launch_distributed_ll.sh | 93 +++++++++++++++---------------- 3 files changed, 98 insertions(+), 49 deletions(-) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..6345310e8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,53 @@ +{ + "name": "lam_rocm", + "image": "docker.io/rocm/megatron-lm:v25.9_gfx950", + + "remoteUser": "root", + + // Container settings + "runArgs": [ + "--ipc=host", + "--network=host", + "--shm-size=64GB", + "--cap-add=SYS_PTRACE", + "--cap-add=CAP_SYS_ADMIN", + "--cap-add=IPC_LOCK", + "--security-opt=seccomp=unconfined", + "--group-add=video", + "--privileged", + "--device=/dev/kfd", + "--device=/dev/dri", + "--device=/dev/infiniband", + "--ulimit=memlock=-1:-1", + "--ulimit=stack=67108864:67108864" + ], + + // Volume mounts + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/root/.ssh,type=bind,readonly", + "source=/sys/class/infiniband,target=/sys/class/infiniband,type=bind", + "source=/dev/hugepages,target=/dev/hugepages,type=bind", + "source=/usr/lib/x86_64-linux-gnu/libionic.so.1.0.54.0-164.g21c72dcad,target=/usr/lib/x86_64-linux-gnu/libionic.so.1.0.54.0-164.g21c72dcad,type=bind", + "source=/usr/lib/x86_64-linux-gnu/libionic.so.1,target=/usr/lib/x86_64-linux-gnu/libionic.so.1,type=bind", + "source=/usr/lib/x86_64-linux-gnu/libionic.so,target=/usr/lib/x86_64-linux-gnu/libionic.so,type=bind", + 
"source=/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so,target=/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so,type=bind", + "source=/etc/libibverbs.d/ionic.driver,target=/etc/libibverbs.d/ionic.driver,type=bind" + ], + + // Working directory inside container + "workspaceFolder": "/io", + "workspaceMount": "source=${localWorkspaceFolder},target=/io,type=bind", + + // Post-create command to install dependencies + "postCreateCommand": "apt-get update && apt-get install -y libibverbs-dev libgoogle-glog-dev && cd ep && ./install_deps.sh && python setup.py install", + + // Optional: VS Code / Cursor extensions to install in container + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-python.vscode-pylance" + ] + } + } +} diff --git a/.gitignore b/.gitignore index c24ecf89b..506ef150c 100644 --- a/.gitignore +++ b/.gitignore @@ -100,4 +100,5 @@ ep/figs ep/deep_ep_wrapper/deep_ep.egg-info/ *.json +!.devcontainer/devcontainer.json *result.jsonl \ No newline at end of file diff --git a/ep/bench/launch_distributed_ll.sh b/ep/bench/launch_distributed_ll.sh index ce8804e2b..920685992 100755 --- a/ep/bench/launch_distributed_ll.sh +++ b/ep/bench/launch_distributed_ll.sh @@ -3,14 +3,15 @@ # Launch Low Latency Test Script (Distributed) # # Launches test_low_latency.py across multiple nodes via SSH +# Executes inside Docker container "lam_rocm" on each node # # Usage: # ./launch_distributed.sh # Launch the test -# ./launch_distributed.sh kill # Kill lam conda env GPU processes only +# ./launch_distributed.sh kill # Kill GPU processes in lam_rocm container # # Environment variables: # MASTER_PORT - Master port (default: random 29500-30499) -# CONDA_ENV - Conda environment to use (default: lam) +# CONTAINER - Docker container name (default: lam_rocm) # NUM_TOKENS - Number of tokens (default: 128) # HIDDEN - Hidden size (default: 7168) # NUM_TOPK - Top-k value (default: 8) @@ -18,41 +19,43 @@ # PER_RANK_LOG - Enable per-GPU/rank 
logging (default: 1, set to 0 to disable) # # Example: -# CONDA_ENV=lam NUM_TOKENS=256 ./launch_distributed.sh +# CONTAINER=lam_rocm NUM_TOKENS=256 ./launch_distributed.sh # PER_RANK_LOG=0 ./launch_distributed.sh # disable per-GPU logs (node-level only) # # Configuration -MASTER_ADDR="10.162.224.129" +MASTER_ADDR="chi2762" MASTER_PORT="${MASTER_PORT:-$((29500 + RANDOM % 1000))}" # Random port 29500-30499 if not specified NNODES=4 NPROC_PER_NODE=8 # Node IPs (in order of node_rank) NODES=( - "10.162.224.129" - "10.162.224.132" - "10.162.224.133" - "10.162.224.134" + "chi2762" + "chi2766" + "chi2877" + "chi2888" ) -# Kill command - only kills processes running under the lam conda environment +# Docker container name +CONTAINER="${CONTAINER:-lam_rocm}" + +# Kill command - kills python processes inside the lam_rocm container # Two-stage: first shows PIDs, then asks for confirmation if [ "$1" == "kill" ]; then - echo "Scanning for lam conda env GPU processes on all nodes..." - LAM_PYTHON_PATH="/home/yangzhou/miniconda3/envs/lam/bin/python" + echo "Scanning for python processes in ${CONTAINER} container on all nodes..." # Declare associative array to store PIDs per node declare -A NODE_PIDS TOTAL_PIDS=0 - # Stage 1: Collect PIDs from all nodes + # Stage 1: Collect PIDs from all nodes (inside docker containers) for node_ip in "${NODES[@]}"; do echo " Checking ${node_ip}..." 
if [ "${node_ip}" == "${MASTER_ADDR}" ]; then - pids=$(sudo amd-smi process 2>/dev/null | grep -A1 "NAME: ${LAM_PYTHON_PATH}" | grep 'PID:' | awk '{print $2}' | sort -u | tr '\n' ' ') + pids=$(docker exec ${CONTAINER} pgrep -f "python.*torch" 2>/dev/null | tr '\n' ' ') else - pids=$(ssh -o StrictHostKeyChecking=no "${node_ip}" "sudo amd-smi process 2>/dev/null | grep -A1 'NAME: ${LAM_PYTHON_PATH}' | grep 'PID:' | awk '{print \$2}' | sort -u | tr '\n' ' '" 2>/dev/null) + pids=$(ssh -o StrictHostKeyChecking=no "${node_ip}" "docker exec ${CONTAINER} pgrep -f 'python.*torch'" 2>/dev/null | tr '\n' ' ') fi pids=$(echo "$pids" | xargs) # trim whitespace if [ -n "$pids" ]; then @@ -61,14 +64,14 @@ if [ "$1" == "kill" ]; then TOTAL_PIDS=$((TOTAL_PIDS + pid_count)) echo " Found PIDs: $pids" else - echo " No lam processes found" + echo " No torch processes found in container" fi done # Check if any PIDs found if [ "$TOTAL_PIDS" -eq 0 ]; then echo "" - echo "No lam processes found on any node." + echo "No torch processes found in ${CONTAINER} container on any node." exit 0 fi @@ -88,26 +91,24 @@ if [ "$1" == "kill" ]; then exit 0 fi - # Stage 3: Kill confirmed PIDs + # Stage 3: Kill confirmed PIDs (inside docker containers) echo "" echo "Killing processes..." for node_ip in "${!NODE_PIDS[@]}"; do pids="${NODE_PIDS[$node_ip]}" echo " Killing on ${node_ip}: $pids" if [ "${node_ip}" == "${MASTER_ADDR}" ]; then - echo "$pids" | xargs -r kill -9 2>/dev/null + for pid in $pids; do + docker exec ${CONTAINER} kill -9 $pid 2>/dev/null + done else - ssh -o StrictHostKeyChecking=no "${node_ip}" "echo '$pids' | xargs -r kill -9" 2>/dev/null + ssh -o StrictHostKeyChecking=no "${node_ip}" "for pid in $pids; do docker exec ${CONTAINER} kill -9 \$pid 2>/dev/null; done" fi done echo "Done." 
exit 0 fi -# Conda configuration -CONDA_PATH="/home/yangzhou/miniconda3" -CONDA_ENV="${CONDA_ENV:-lam}" # Can override with CONDA_ENV=myenv - # Default benchmark parameters NUM_TOKENS="${NUM_TOKENS:-128}" HIDDEN="${HIDDEN:-7168}" @@ -128,7 +129,7 @@ echo "Launching distributed test_low_latency.py" echo "==============================================" echo "Master: ${MASTER_ADDR}:${MASTER_PORT}" echo "Nodes: ${NNODES}, GPUs per node: ${NPROC_PER_NODE}" -echo "Conda env: ${CONDA_ENV}" +echo "Container: ${CONTAINER}" echo "Parameters: tokens=${NUM_TOKENS}, hidden=${HIDDEN}, topk=${NUM_TOPK}, experts=${NUM_EXPERTS}" echo "Logs: ${RUN_LOG_DIR}" if [ "${PER_RANK_LOG:-1}" == "1" ]; then @@ -146,9 +147,6 @@ launch_node() { echo "[$(date '+%H:%M:%S')] Launching on node ${node_rank} (${node_ip})..." - # Build the command using explicit Python path (more reliable than conda activate) - local PYTHON_BIN="${CONDA_PATH}/envs/${CONDA_ENV}/bin/python" - # Per-rank logging: use torchrun's --log-dir and --tee options local LOG_OPTS="" if [ "${PER_RANK_LOG:-1}" == "1" ]; then @@ -160,7 +158,7 @@ launch_node() { LOG_OPTS="--log-dir ${RANK_LOG_DIR} --tee 3" fi - local torchrun_cmd="${PYTHON_BIN} -m torch.distributed.run \ + local torchrun_cmd="python -m torch.distributed.run \ --nnodes=${NNODES} \ --nproc_per_node=${NPROC_PER_NODE} \ --node_rank=${node_rank} \ @@ -174,40 +172,37 @@ launch_node() { --num-experts=${NUM_EXPERTS}" # Set required environment variables for network interfaces - local env_vars="export NCCL_SOCKET_IFNAME=enp49s0f1np1 && \ -export GLOO_SOCKET_IFNAME=enp49s0f1np1 && \ -export UCCL_IB_GID_INDEX=3 && \ -export NCCL_IB_GID_INDEX=3 && \ + local env_vars="export NCCL_SOCKET_IFNAME=enp193s0f1np1 && \ +export GLOO_SOCKET_IFNAME=enp193s0f1np1 && \ +export UCCL_IB_GID_INDEX=1 && \ +export NCCL_IB_GID_INDEX=1 && \ export DEBUG_DISPATCH=1" - local cmd="${env_vars} && cd ${SCRIPT_DIR}/.. 
&& ${torchrun_cmd}" + + # Build the command to run inside docker container + local docker_cmd="docker exec ${CONTAINER} bash -c '${env_vars} && cd /io/ep && ${torchrun_cmd}'" # Write command to log file first { echo "==============================================" + echo "Container: ${CONTAINER}" echo "Command: ${torchrun_cmd}" echo "Node: ${node_rank} (${node_ip})" echo "Time: $(date)" echo "Environment:" - echo " NCCL_SOCKET_IFNAME=enp49s0f1np1" - echo " GLOO_SOCKET_IFNAME=enp49s0f1np1" - echo " UCCL_IB_GID_INDEX=3" - echo " NCCL_IB_GID_INDEX=3" + echo " NCCL_SOCKET_IFNAME=enp193s0f1np1" + echo " GLOO_SOCKET_IFNAME=enp193s0f1np1" + echo " UCCL_IB_GID_INDEX=1" + echo " NCCL_IB_GID_INDEX=1" echo "==============================================" echo "" } > "${log_file}" - if [ "${node_ip}" == "${MASTER_ADDR}" ] && [ "${node_rank}" -eq 0 ]; then - # Local execution for master node - nohup bash -c "${cmd}" >> "${log_file}" 2>&1 & - else - # SSH to remote node - # Write output to shared filesystem log file on remote side - # Use nohup so process survives SSH disconnect - ssh -o StrictHostKeyChecking=no \ - -o ServerAliveInterval=30 \ - -o ServerAliveCountMax=10 \ - "${node_ip}" "nohup bash -c '${cmd}' >> ${log_file} 2>&1 &" & - fi + # SSH to node (including master) and run inside docker container + # This ensures consistent behavior across all nodes + ssh -o StrictHostKeyChecking=no \ + -o ServerAliveInterval=30 \ + -o ServerAliveCountMax=10 \ + "${node_ip}" "nohup ${docker_cmd} >> ${log_file} 2>&1 &" & echo "[$(date '+%H:%M:%S')] Node ${node_rank} launched, PID: $!, log: ${log_file}" } From af616fc121291b8a1166b05f929745dd3740f708 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:42 -0500 Subject: [PATCH 04/15] change the test case to minimial setup to easy developing --- ep/bench/launch_distributed_ll.sh | 297 +++++++++++++++++------------- ep/bench/test_low_latency.py | 99 +++++++++- 2 files changed, 265 insertions(+), 131 deletions(-) diff 
--git a/ep/bench/launch_distributed_ll.sh b/ep/bench/launch_distributed_ll.sh index 920685992..3138536c5 100755 --- a/ep/bench/launch_distributed_ll.sh +++ b/ep/bench/launch_distributed_ll.sh @@ -3,59 +3,86 @@ # Launch Low Latency Test Script (Distributed) # # Launches test_low_latency.py across multiple nodes via SSH -# Executes inside Docker container "lam_rocm" on each node +# Supports both Docker and Conda execution modes # # Usage: -# ./launch_distributed.sh # Launch the test -# ./launch_distributed.sh kill # Kill GPU processes in lam_rocm container +# ./launch_distributed_ll.sh [--docker | --conda] # Launch the test (default: conda) +# ./launch_distributed_ll.sh kill # Kill running processes +# +# Options: +# --docker Run inside Docker container (default container: lam_rocm) +# --conda Run inside conda environment (default: /home/ubuntu/lam/conda_lam_local) # # Environment variables: # MASTER_PORT - Master port (default: random 29500-30499) -# CONTAINER - Docker container name (default: lam_rocm) +# CONTAINER - Docker container name (default: lam_rocm, only for --docker) +# CONDA_ENV - Conda environment path (default: /home/ubuntu/lam/conda_lam_local, only for --conda) # NUM_TOKENS - Number of tokens (default: 128) # HIDDEN - Hidden size (default: 7168) # NUM_TOPK - Top-k value (default: 8) # NUM_EXPERTS - Number of experts (default: 288) -# PER_RANK_LOG - Enable per-GPU/rank logging (default: 1, set to 0 to disable) # -# Example: -# CONTAINER=lam_rocm NUM_TOKENS=256 ./launch_distributed.sh -# PER_RANK_LOG=0 ./launch_distributed.sh # disable per-GPU logs (node-level only) +# This script passes --stop-after-first to test_low_latency.py so only the first +# experiment (return_recv_hook=False, dispatch_use_fp8=False, ...) is run. 
+# +# Examples: +# ./launch_distributed_ll.sh --conda +# ./launch_distributed_ll.sh --docker +# ./launch_distributed_ll.sh --conda NUM_TOKENS=256 +# ./launch_distributed_ll.sh --docker CONTAINER=my_container kill # -# Configuration -MASTER_ADDR="chi2762" -MASTER_PORT="${MASTER_PORT:-$((29500 + RANDOM % 1000))}" # Random port 29500-30499 if not specified -NNODES=4 -NPROC_PER_NODE=8 +# Parse command line options +RUN_MODE="conda" # default mode +while [[ $# -gt 0 ]]; do + case $1 in + --docker) + RUN_MODE="docker" + shift + ;; + --conda) + RUN_MODE="conda" + shift + ;; + *) + break + ;; + esac +done # Node IPs (in order of node_rank) NODES=( - "chi2762" - "chi2766" - "chi2877" - "chi2888" + "172.31.24.178" + "172.31.20.184" ) +# Configuration - MASTER_ADDR automatically set to first node +MASTER_ADDR="${NODES[0]}" +MASTER_PORT="${MASTER_PORT:-$((29500 + RANDOM % 1000))}" # Random port 29500-30499 if not specified +NNODES=2 +NPROC_PER_NODE=8 + # Docker container name CONTAINER="${CONTAINER:-lam_rocm}" -# Kill command - kills python processes inside the lam_rocm container -# Two-stage: first shows PIDs, then asks for confirmation +# Conda environment path +CONDA_ENV="${CONDA_ENV:-/home/ubuntu/lam/conda_lam_local}" + +# Kill command - kills processes based on mode if [ "$1" == "kill" ]; then - echo "Scanning for python processes in ${CONTAINER} container on all nodes..." - + echo "Scanning for processes on all nodes (mode: ${RUN_MODE})..." + # Declare associative array to store PIDs per node declare -A NODE_PIDS TOTAL_PIDS=0 - - # Stage 1: Collect PIDs from all nodes (inside docker containers) + + # Stage 1: Collect PIDs from all nodes for node_ip in "${NODES[@]}"; do echo " Checking ${node_ip}..." 
- if [ "${node_ip}" == "${MASTER_ADDR}" ]; then - pids=$(docker exec ${CONTAINER} pgrep -f "python.*torch" 2>/dev/null | tr '\n' ' ') - else + if [ "${RUN_MODE}" == "docker" ]; then pids=$(ssh -o StrictHostKeyChecking=no "${node_ip}" "docker exec ${CONTAINER} pgrep -f 'python.*torch'" 2>/dev/null | tr '\n' ' ') + else + pids=$(ssh -o StrictHostKeyChecking=no "${node_ip}" "pgrep -f 'python.*test_low_latency'" 2>/dev/null | tr '\n' ' ') fi pids=$(echo "$pids" | xargs) # trim whitespace if [ -n "$pids" ]; then @@ -64,17 +91,17 @@ if [ "$1" == "kill" ]; then TOTAL_PIDS=$((TOTAL_PIDS + pid_count)) echo " Found PIDs: $pids" else - echo " No torch processes found in container" + echo " No processes found" fi done - + # Check if any PIDs found if [ "$TOTAL_PIDS" -eq 0 ]; then echo "" - echo "No torch processes found in ${CONTAINER} container on any node." + echo "No processes found on any node." exit 0 fi - + # Stage 2: Ask for confirmation echo "" echo "==========================================" @@ -85,24 +112,22 @@ if [ "$1" == "kill" ]; then done echo "" read -p "Kill these processes? [y/N]: " confirm - + if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then echo "Aborted." exit 0 fi - - # Stage 3: Kill confirmed PIDs (inside docker containers) + + # Stage 3: Kill confirmed PIDs echo "" echo "Killing processes..." for node_ip in "${!NODE_PIDS[@]}"; do pids="${NODE_PIDS[$node_ip]}" echo " Killing on ${node_ip}: $pids" - if [ "${node_ip}" == "${MASTER_ADDR}" ]; then - for pid in $pids; do - docker exec ${CONTAINER} kill -9 $pid 2>/dev/null - done - else + if [ "${RUN_MODE}" == "docker" ]; then ssh -o StrictHostKeyChecking=no "${node_ip}" "for pid in $pids; do docker exec ${CONTAINER} kill -9 \$pid 2>/dev/null; done" + else + ssh -o StrictHostKeyChecking=no "${node_ip}" "for pid in $pids; do kill -9 \$pid 2>/dev/null; done" fi done echo "Done." 
@@ -118,138 +143,152 @@ NUM_EXPERTS="${NUM_EXPERTS:-288}" # Log directory (shared filesystem) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" LOG_DIR="${SCRIPT_DIR}/logs" -TIMESTAMP=$(date +"%Y%m%d_%H%M%S") -RUN_LOG_DIR="${LOG_DIR}/${TIMESTAMP}" +RUN_LOG_DIR="${LOG_DIR}/latest" # Create log directory mkdir -p "${RUN_LOG_DIR}" +# Clean old logs before starting +echo "Cleaning old logs in ${RUN_LOG_DIR}..." +rm -rf "${RUN_LOG_DIR:?}"/* + echo "==============================================" echo "Launching distributed test_low_latency.py" echo "==============================================" echo "Master: ${MASTER_ADDR}:${MASTER_PORT}" echo "Nodes: ${NNODES}, GPUs per node: ${NPROC_PER_NODE}" -echo "Container: ${CONTAINER}" -echo "Parameters: tokens=${NUM_TOKENS}, hidden=${HIDDEN}, topk=${NUM_TOPK}, experts=${NUM_EXPERTS}" -echo "Logs: ${RUN_LOG_DIR}" -if [ "${PER_RANK_LOG:-1}" == "1" ]; then - echo "Per-rank logging: ENABLED (logs in node_X/local_rank_Y/)" +echo "Total GPUs: $((NNODES * NPROC_PER_NODE))" +echo "Mode: ${RUN_MODE}" +if [ "${RUN_MODE}" == "docker" ]; then + echo "Container: ${CONTAINER}" else - echo "Per-rank logging: disabled" + echo "Conda env: ${CONDA_ENV}" fi +echo "Parameters: tokens=${NUM_TOKENS}, hidden=${HIDDEN}, topk=${NUM_TOPK}, experts=${NUM_EXPERTS}" +echo "Logs: ${RUN_LOG_DIR}/" +echo " rank{0..$((NNODES * NPROC_PER_NODE - 1))}.log + node{0..$((NNODES - 1))}.log" echo "==============================================" # Function to launch on a node launch_node() { local node_rank=$1 local node_ip=${NODES[$node_rank]} - local log_file="${RUN_LOG_DIR}/node_${node_rank}_${node_ip}.log" - + echo "[$(date '+%H:%M:%S')] Launching on node ${node_rank} (${node_ip})..." 
- - # Per-rank logging: use torchrun's --log-dir and --tee options - local LOG_OPTS="" - if [ "${PER_RANK_LOG:-1}" == "1" ]; then - # Create per-rank log directory for this node - local RANK_LOG_DIR="${RUN_LOG_DIR}/node_${node_rank}" - mkdir -p "${RANK_LOG_DIR}" - # --tee 3 = tee both stdout(1) and stderr(2) to log files - # --log-dir = directory where per-rank logs will be written - LOG_OPTS="--log-dir ${RANK_LOG_DIR} --tee 3" - fi - - local torchrun_cmd="python -m torch.distributed.run \ - --nnodes=${NNODES} \ - --nproc_per_node=${NPROC_PER_NODE} \ - --node_rank=${node_rank} \ - --master_addr=${MASTER_ADDR} \ - --master_port=${MASTER_PORT} \ - ${LOG_OPTS} \ - bench/test_low_latency.py \ - --num-tokens=${NUM_TOKENS} \ - --hidden=${HIDDEN} \ - --num-topk=${NUM_TOPK} \ - --num-experts=${NUM_EXPERTS}" - - # Set required environment variables for network interfaces - local env_vars="export NCCL_SOCKET_IFNAME=enp193s0f1np1 && \ + + # Build command based on mode + # Use "${log_file}" - - # SSH to node (including master) and run inside docker container - # This ensures consistent behavior across all nodes - ssh -o StrictHostKeyChecking=no \ - -o ServerAliveInterval=30 \ - -o ServerAliveCountMax=10 \ - "${node_ip}" "nohup ${docker_cmd} >> ${log_file} 2>&1 &" & - - echo "[$(date '+%H:%M:%S')] Node ${node_rank} launched, PID: $!, log: ${log_file}" +export DEBUG_DISPATCH=1 && \ +export UCCL_LOG_DIR=${RUN_LOG_DIR}" + local torchrun_cmd="${CONDA_ENV}/bin/python -m torch.distributed.run \ + --nnodes=${NNODES} \ + --nproc_per_node=${NPROC_PER_NODE} \ + --node_rank=${node_rank} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + ${SCRIPT_DIR}/test_low_latency.py \ + --num-tokens=${NUM_TOKENS} \ + --hidden=${HIDDEN} \ + --num-topk=${NUM_TOPK} \ + --num-experts=${NUM_EXPERTS} \ + --stop-after-first" + local exec_cmd="docker exec ${CONTAINER} bash -c '${env_vars} && cd ${SCRIPT_DIR} && ${torchrun_cmd}'" + ssh -o StrictHostKeyChecking=no \ + -o 
ServerAliveInterval=30 \ + -o ServerAliveCountMax=10 \ + "${node_ip}" "nohup ${exec_cmd} /dev/null 2>&1 &" + else + # Conda mode + local conda_cmd="export UCCL_LOG_DIR=${RUN_LOG_DIR} && \ + cd ${SCRIPT_DIR} && ${CONDA_ENV}/bin/python -m torch.distributed.run \ + --nnodes=${NNODES} \ + --nproc_per_node=${NPROC_PER_NODE} \ + --node_rank=${node_rank} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + ${SCRIPT_DIR}/test_low_latency.py \ + --num-tokens=${NUM_TOKENS} \ + --hidden=${HIDDEN} \ + --num-topk=${NUM_TOPK} \ + --num-experts=${NUM_EXPERTS} \ + --stop-after-first" + ssh -o StrictHostKeyChecking=no \ + -o ServerAliveInterval=30 \ + -o ServerAliveCountMax=10 \ + "${node_ip}" "nohup bash -c '${conda_cmd}' /dev/null 2>&1 &" + fi + + echo "[$(date '+%H:%M:%S')] Node ${node_rank} launched" } # Launch all nodes -PIDS=() for rank in $(seq 0 $((NNODES - 1))); do launch_node ${rank} - PIDS+=($!) done echo "" -echo "All nodes launched. PIDs: ${PIDS[*]}" -echo "Waiting for completion..." +echo "All nodes launched." +echo "Waiting for completion (checking marker files)..." echo "" -# Wait for all processes and collect exit codes -EXIT_CODES=() -for i in "${!PIDS[@]}"; do - wait ${PIDS[$i]} - EXIT_CODES+=($?) 
+TOTAL_GPUS=$((NNODES * NPROC_PER_NODE)) + +# Poll until all ranks are done (check for .done.{rank} marker files) +POLL_INTERVAL=3 +while true; do + done_count=0 + for rank in $(seq 0 $((TOTAL_GPUS - 1))); do + if [ -f "${RUN_LOG_DIR}/.done.${rank}" ]; then + done_count=$((done_count + 1)) + fi + done + + if [ "$done_count" -eq "$TOTAL_GPUS" ]; then + break + fi + + echo "[$(date '+%H:%M:%S')] Progress: ${done_count}/${TOTAL_GPUS} ranks done" + sleep ${POLL_INTERVAL} done +# Clean up marker files +rm -f "${RUN_LOG_DIR}"/.done.* + # Print summary echo "" echo "==============================================" echo "Execution completed" echo "==============================================" -for i in "${!EXIT_CODES[@]}"; do - status="SUCCESS" - if [ ${EXIT_CODES[$i]} -ne 0 ]; then - status="FAILED (exit code: ${EXIT_CODES[$i]})" - fi - echo "Node ${i} (${NODES[$i]}): ${status}" +for i in $(seq 0 $((NNODES - 1))); do + echo "Node ${i} (${NODES[$i]}): DONE" done -# Create a combined log file -COMBINED_LOG="${RUN_LOG_DIR}/combined.log" -echo "Creating combined log: ${COMBINED_LOG}" -for rank in $(seq 0 $((NNODES - 1))); do - echo "" >> "${COMBINED_LOG}" - echo "========== Node ${rank} (${NODES[$rank]}) ==========" >> "${COMBINED_LOG}" - echo "" >> "${COMBINED_LOG}" - cat "${RUN_LOG_DIR}/node_${rank}_${NODES[$rank]}.log" >> "${COMBINED_LOG}" +# Create per-node combined logs +echo "" +echo "Creating per-node combined logs..." 
+for node_rank in $(seq 0 $((NNODES - 1))); do + node_log="${RUN_LOG_DIR}/node${node_rank}.log" + > "${node_log}" # Clear/create file + for local_rank in $(seq 0 $((NPROC_PER_NODE - 1))); do + global_rank=$((node_rank * NPROC_PER_NODE + local_rank)) + rank_log="${RUN_LOG_DIR}/rank${global_rank}.log" + if [ -f "${rank_log}" ]; then + echo "=== rank${global_rank} ===" >> "${node_log}" + cat "${rank_log}" >> "${node_log}" + echo "" >> "${node_log}" + fi + done done echo "" echo "Logs saved to: ${RUN_LOG_DIR}" -echo " - Individual logs: node_X_.log" -echo " - Combined log: combined.log" +echo " Per-rank: rank0.log ... rank$((TOTAL_GPUS - 1)).log (${TOTAL_GPUS} files)" +echo " Per-node: node0.log ... node$((NNODES - 1)).log (${NNODES} files)" +echo " Total: $((TOTAL_GPUS + NNODES)) files" diff --git a/ep/bench/test_low_latency.py b/ep/bench/test_low_latency.py index 45667be72..6498c6a6b 100644 --- a/ep/bench/test_low_latency.py +++ b/ep/bench/test_low_latency.py @@ -17,12 +17,72 @@ import random import time import os +import sys import torch import torch.distributed as dist import numpy as np from functools import partial from typing import Optional + +class TeeWriter: + """Write to multiple file handles simultaneously.""" + def __init__(self, *files): + self.files = files + self._fileno = files[0].fileno() if files else None + + def write(self, data): + for f in self.files: + f.write(data) + f.flush() + + def flush(self): + for f in self.files: + f.flush() + + def fileno(self): + """Return fileno of the first file (needed for some redirections).""" + return self._fileno + + +def setup_rank_logging(): + """Redirect stdout/stderr to rank{N}.log if UCCL_LOG_DIR is set. + + This does fd-level redirection so C/CUDA code output is also captured. 
+ """ + log_dir = os.environ.get("UCCL_LOG_DIR") + if log_dir is None: + return + + rank = int(os.environ.get("RANK", 0)) + + # Per-rank log file + rank_log = os.path.join(log_dir, f"rank{rank}.log") + rank_file = open(rank_log, "w", buffering=1) + + # Redirect fd 1 (stdout) and fd 2 (stderr) to the rank log file + # This captures C/CUDA printf output as well + os.dup2(rank_file.fileno(), 1) # stdout + os.dup2(rank_file.fileno(), 2) # stderr + + # Also update Python's sys.stdout/sys.stderr to use the file + sys.stdout = rank_file + sys.stderr = rank_file + + +def mark_done(): + """Create a marker file to indicate this rank is done.""" + log_dir = os.environ.get("UCCL_LOG_DIR") + if log_dir is None: + return + rank = int(os.environ.get("RANK", 0)) + marker_file = os.path.join(log_dir, f".done.{rank}") + open(marker_file, "w").close() + + +# Setup logging as early as possible +setup_rank_logging() + from buffer import Buffer from utils import ( init_dist, @@ -88,6 +148,7 @@ def test_main( seed: int = 0, skip_benchmark: bool = False, debug_hash: bool = False, + stop_after_first: bool = False, ): torch.manual_seed(seed + rank) random.seed(seed + rank) @@ -150,11 +211,22 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): # Preserve the XOR aggregation behavior at per-label granularity. 
hash_details[label] = hash_details.get(label, 0) ^ hv + stop_now = False for current_x in x_list: + if stop_now: + break for return_recv_hook in (False, True): + if stop_now: + break for dispatch_use_fp8 in (False, True): + if stop_now: + break for round_scale in (False,): + if stop_now: + break for round_scale in (False, True) if dispatch_use_fp8 else (False,): + if stop_now: + break for use_ue8m0 in (False, True) if round_scale else (False,): print( "Start experiment with settings:" @@ -165,10 +237,12 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): flush=True, ) num_times += 1 - for i in range((num_times % 2) + 1): + # for i in range((num_times % 2) + 1): + for i in range(1): cumulative_local_expert_recv_stats = torch.zeros( (num_local_experts,), dtype=torch.int, device="cuda" ) + print(f"[python] dispatch called", flush=True) ( packed_recv_x, packed_recv_count, @@ -380,6 +454,17 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): f"|logfmt={use_logfmt}" ) _record_hash(f"combine_out|{tag}", combined_x) + if stop_after_first: + stop_now = True + break + if stop_now: + break + if stop_now: + break + if stop_now: + break + if stop_now: + break # noinspection PyShadowingNames def large_gemm_with_hook(hook): @@ -413,7 +498,7 @@ def test_func(return_recv_hook: bool): print("✓ All correctness tests passed!", flush=True) - if skip_benchmark: + if skip_benchmark or stop_after_first: return (hash_value, hash_details) if debug_hash else hash_value # Calculate bandwidth @@ -497,6 +582,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): seed=seed, skip_benchmark=args.pressure_test_mode == 1, debug_hash=args.debug_hash, + stop_after_first=args.stop_after_first, ) if args.debug_hash: ref_hash, ref_hash_details = ref_out @@ -524,6 +610,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): seed=seed, skip_benchmark=args.pressure_test_mode == 1, 
 debug_hash=args.debug_hash, + stop_after_first=args.stop_after_first, ) if args.debug_hash: current_hash, current_hash_details = cur_out @@ -568,6 +655,9 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): buffer.destroy() dist.barrier() dist.destroy_process_group() + + # Mark this rank as done + mark_done() if __name__ == "__main__": @@ -612,6 +702,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): action="store_true", help="Print per-tensor hash breakdown when non-determinism is detected.", ) + parser.add_argument( + "--stop-after-first", + action="store_true", + help="Stop after the first experiment (return_recv_hook=False, dispatch_use_fp8=False, round_scale=False, use_ue8m0=False).", + ) args = parser.parse_args() num_processes = args.num_processes From 696d2915d61a4a097ff6440349073741a58d2441 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:43 -0500 Subject: [PATCH 05/15] add LAM_DEV flag in order to separate the new and old code cleanly and avoid compile-related performance diff --- ep/.gitignore | 3 ++- ep/Makefile | 27 ++++++++++++++++++++++++--- ep/bench/test_low_latency.py | 11 ++++++++--- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/ep/.gitignore b/ep/.gitignore index 12a320ce6..ec02ea881 100644 --- a/ep/.gitignore +++ b/ep/.gitignore @@ -1,4 +1,5 @@ benchmark *.hip *_hip.* -*.egg-info \ No newline at end of file +*.egg-info +.lam_dev_state \ No newline at end of file diff --git a/ep/Makefile b/ep/Makefile index 93aa112a6..28c121871 100644 --- a/ep/Makefile +++ b/ep/Makefile @@ -55,8 +55,29 @@ LDFLAGS := -lpthread -lglog -libverbs -lnl-3 -lnl-route-3 -lnuma -Xlinker -rpath NVCCFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -pthread -fPIC -fvisibility=hidden" -ccbin /usr/bin/g++ --expt-relaxed-constexpr INCLUDES := -Iinclude -I$(CUDA_PATH)/include -I/usr/include -I../include -CXXFLAGS += $(EFA_CFLAGS) $(GH_CFLAGS) $(NORMAL_CFLAGS) -NVCCFLAGS += 
$(EFA_CFLAGS) $(GH_CFLAGS) $(NORMAL_CFLAGS) +# LAM_DEV: Enable Lam's development/debug code (set LAM_DEV=1 to enable) +# Auto-detects changes and only rebuilds affected files +LAM_DEV ?= 0 +LAM_DEV_STATE_FILE := .lam_dev_state +LAM_DEV_CURRENT := $(shell cat $(LAM_DEV_STATE_FILE) 2>/dev/null || echo 0) +# Files affected by LAM_DEV: internode_ll.cu, ep_config.hpp (used by rdma.cpp, uccl_ep.cc) +LAM_DEV_AFFECTED_OBJS := src/internode_ll.o src/rdma.o src/uccl_ep.o + +ifneq ($(LAM_DEV),$(LAM_DEV_CURRENT)) + $(info LAM_DEV changed from $(LAM_DEV_CURRENT) to $(LAM_DEV), will rebuild affected files) + $(shell rm -f $(LAM_DEV_AFFECTED_OBJS)) + $(shell echo $(LAM_DEV) > $(LAM_DEV_STATE_FILE)) +endif + +ifeq ($(LAM_DEV),1) + LAM_DEV_FLAGS := -DLAM_DEV + $(info LAM_DEV enabled) +else + LAM_DEV_FLAGS := +endif + +CXXFLAGS += $(EFA_CFLAGS) $(GH_CFLAGS) $(NORMAL_CFLAGS) $(LAM_DEV_FLAGS) +NVCCFLAGS += $(EFA_CFLAGS) $(GH_CFLAGS) $(NORMAL_CFLAGS) $(LAM_DEV_FLAGS) LDFLAGS += $(EFA_LDFLAGS) INCLUDES += $(EFA_CFLAGS) $(GH_CFLAGS) $(NORMAL_CFLAGS) @@ -106,7 +127,7 @@ install: $(EP_EXT) # Clean all generated files clean: rm -f $(OBJ_CPP) $(OBJ_CC) $(OBJ_CU) \ - $(EP_EXT) \ + $(EP_EXT) $(LAM_DEV_STATE_FILE) \ *.d src/*.d bench/*.d src/*.o **/*.hip **/*_hip.* # Automatically include dependency files if they exist diff --git a/ep/bench/test_low_latency.py b/ep/bench/test_low_latency.py index 6498c6a6b..5c266a5ad 100644 --- a/ep/bench/test_low_latency.py +++ b/ep/bench/test_low_latency.py @@ -237,8 +237,11 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): flush=True, ) num_times += 1 - # for i in range((num_times % 2) + 1): - for i in range(1): + running_count = (num_times % 2) + 1 + # Lam: If stop_after_first, just run once to check the code + if stop_after_first: + running_count = 1 + for i in range(running_count): cumulative_local_expert_recv_stats = torch.zeros( (num_local_experts,), dtype=torch.int, device="cuda" ) @@ -498,7 +501,9 @@ def 
 test_func(return_recv_hook: bool): print("✓ All correctness tests passed!", flush=True) - if skip_benchmark or stop_after_first: + # Lam: If stop_after_first, just run once to check the code + # or stop_after_first + if skip_benchmark: return (hash_value, hash_details) if debug_hash else hash_value # Calculate bandwidth From bebaebc8e94b6c52b107ce8ab87661961c593aa8 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:44 -0500 Subject: [PATCH 06/15] add global slot counter for telling senders where the rdma send buffer should put (for batching) --- ep/include/ep_config.hpp | 6 +++ ep/src/internode_ll.cu | 86 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/ep/include/ep_config.hpp b/ep/include/ep_config.hpp index 451b3d27b..92c41fb7f 100644 --- a/ep/include/ep_config.hpp +++ b/ep/include/ep_config.hpp @@ -196,8 +196,14 @@ struct LowLatencyLayout { size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16); // Send buffer +#ifdef LAM_DEV + // Lam: Change the layout to [slot][dst_expert] to enable batched RDMA sends + size_t dispatch_send_buffer_bytes = + num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; +#else size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; +#endif size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg; diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index 7cb551bba..d08898456 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -12,6 +12,15 @@ namespace cg = cooperative_groups; namespace uccl { namespace internode_ll { +#ifdef LAM_DEV +// Lam: Global lock for debug printing (ensures printf calls don't interleave) +__device__ int g_print_lock = 0; +// Lam: Helper macro for conditional kernel arguments +#define LAM_DEV_ARG(x) x, +#else +#define LAM_DEV_ARG(x) +#endif + #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) 
constexpr int kNumMaxWarpGroups = 16; #else @@ -53,6 +62,9 @@ __global__ __launch_bounds__(1024, 1) void dispatch( int64_t* dispatch_wait_recv_cost_stats, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, void const* x, int64_t const* topk_idx, int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, +#ifdef LAM_DEV + int* atomic_send_counter_per_expert, +#endif int* next_clean, int64_t* next_clean_second, int num_next_clean_int, int num_tokens, int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, int rank, int num_ranks, int num_warp_groups, @@ -63,6 +75,11 @@ __global__ __launch_bounds__(1024, 1) void dispatch( void* atomic_buffer_ptr = nullptr, int64_t* rdma_recv_count_internode = nullptr, int* grid_sync_barrier_ptr = nullptr) { +#ifdef LAM_DEV + if (blockIdx.x == 0 && threadIdx.x == 0) { + // printf("[LAM_DEV] dispatch called\n"); + } +#endif auto const sm_id = static_cast(blockIdx.x); auto const thread_id = static_cast(threadIdx.x); auto const warp_id = thread_id / WARP_SIZE, lane_id = get_lane_id(); @@ -98,6 +115,13 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Expert counts __shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups]; +#ifdef LAM_DEV + // Lam: Send slots for each topk destination (for batched send buffer layout) + constexpr int kNumMaxTopK = 9; + __shared__ int shared_send_slots[kNumMaxTopK]; + __shared__ int shared_dst_experts[kNumMaxTopK]; +#endif + #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) // initialize barrier amd::barrier_init(1); @@ -136,6 +160,41 @@ __global__ __launch_bounds__(1024, 1) void dispatch( : -1; thread_id == 0 ? 
(*rdma_x_src_idx = token_idx) : 0; +#ifdef LAM_DEV + // Lam: Allocate send slots for each topk destination + // Each warp (warp_id < num_topk) allocates a slot for its destination expert + if (warp_id < num_topk && lane_id == 0) { + shared_dst_experts[warp_id] = dst_expert_idx; + if (dst_expert_idx >= 0) { + shared_send_slots[warp_id] = + atomicAdd(atomic_send_counter_per_expert + dst_expert_idx, 1); + } else { + shared_send_slots[warp_id] = -1; + } + } + // Sync to make shared_send_slots visible to all threads + sync_barrier_1((num_warps - 1) * WARP_SIZE); + + // // Lam: Debug print: verify send slot allocation (only first SM, first few tokens) + // // Uses spinlock to prevent interleaved output from different threads + // if (thread_id == 0) { + // // Acquire print lock + // while (atomicCAS(&g_print_lock, 0, 1) != 0); + // __threadfence(); + + // printf("[LAM_DEV] rank=%d token_idx=%d send_slots: ", rank, token_idx); + // for (int k = 0; k < num_topk; k++) { + // printf("(expert=%d, slot=%d) ", shared_dst_experts[k], + // shared_send_slots[k]); + // } + // printf("\n"); + + // __threadfence(); + // atomicExch(&g_print_lock, 0); // Release print lock + // } + // sync_barrier_1((num_warps - 1) * WARP_SIZE); +#endif // LAM_DEV (slot allocation + debug print) + // FP8 cast #pragma unroll for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) { @@ -319,10 +378,29 @@ __global__ __launch_bounds__(1024, 1) void dispatch( st_release_sys_global(reinterpret_cast(dst_p2p_ptr), -num_tokens_sent - 1); } + +#ifdef LAM_DEV + // Lam: Debug print: verify send counter matches expected count + // { + // auto const send_counter_val = + // atomic_send_counter_per_expert[responsible_expert_idx]; + // if (rank == 0 && responsible_expert_idx < 8) { + // printf( + // "[LAM_DEV] rank=%d expert=%d: expected_count=%d, send_counter=%d, " + // "match=%s\n", + // rank, responsible_expert_idx, num_tokens_sent, send_counter_val, + // (num_tokens_sent == send_counter_val) ? 
"YES" : "NO"); + // } + // } +#endif + // Clean workspace for next use atomic_counter_per_expert[responsible_expert_idx] = 0; atomic_finish_counter_per_expert[responsible_expert_idx] = 0; +#ifdef LAM_DEV + atomic_send_counter_per_expert[responsible_expert_idx] = 0; +#endif // Clean `packed_recv_count` if (dst_rank == 0) packed_recv_count[dst_expert_local_idx] = 0; } @@ -546,8 +624,15 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, auto atomic_counter_per_expert = static_cast(workspace); auto atomic_finish_counter_per_expert = atomic_counter_per_expert + num_experts; +#ifdef LAM_DEV + auto atomic_send_counter_per_expert = + atomic_finish_counter_per_expert + num_experts; + auto grid_sync_barrier_ptr = atomic_send_counter_per_expert + num_experts; + EP_HOST_ASSERT((num_experts * 3 + 1) * sizeof(int) <= NUM_WORKSPACE_BYTES); +#else auto grid_sync_barrier_ptr = atomic_finish_counter_per_expert + num_experts; EP_HOST_ASSERT((num_experts * 2 + 1) * sizeof(int) <= NUM_WORKSPACE_BYTES); +#endif // FP8 checks if (use_ue8m0) @@ -565,6 +650,7 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, cumulative_local_expert_recv_stats, dispatch_wait_recv_cost_stats, \ rdma_recv_x, rdma_recv_count, rdma_x, x, topk_idx, \ atomic_counter_per_expert, atomic_finish_counter_per_expert, \ + LAM_DEV_ARG(atomic_send_counter_per_expert) \ next_clean, next_clean_second, num_next_clean_int, num_tokens, \ num_max_dispatch_tokens_per_rank, num_topk, num_experts, rank, \ num_ranks, num_warp_groups, num_warps_per_group, round_scale, phases, \ From a0fae5072b6732becd82bbf7a5b5e0a054bfb0ba Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:44 -0500 Subject: [PATCH 07/15] add personal scripts and docs --- .devcontainer/devcontainer.json | 53 -- ep/lam/docs/data.md | 21 + ep/lam/docs/dispatch_buffer_trace.md | 150 ++++++ ep/lam/docs/lam_understanding.md | 497 ++++++++++++++++++ ep/lam/setup_scripts/setup_lam_conda_local.sh | 17 + 5 files changed, 685 
insertions(+), 53 deletions(-) delete mode 100644 .devcontainer/devcontainer.json create mode 100644 ep/lam/docs/data.md create mode 100644 ep/lam/docs/dispatch_buffer_trace.md create mode 100644 ep/lam/docs/lam_understanding.md create mode 100755 ep/lam/setup_scripts/setup_lam_conda_local.sh diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 6345310e8..000000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "name": "lam_rocm", - "image": "docker.io/rocm/megatron-lm:v25.9_gfx950", - - "remoteUser": "root", - - // Container settings - "runArgs": [ - "--ipc=host", - "--network=host", - "--shm-size=64GB", - "--cap-add=SYS_PTRACE", - "--cap-add=CAP_SYS_ADMIN", - "--cap-add=IPC_LOCK", - "--security-opt=seccomp=unconfined", - "--group-add=video", - "--privileged", - "--device=/dev/kfd", - "--device=/dev/dri", - "--device=/dev/infiniband", - "--ulimit=memlock=-1:-1", - "--ulimit=stack=67108864:67108864" - ], - - // Volume mounts - "mounts": [ - "source=${localEnv:HOME}/.ssh,target=/root/.ssh,type=bind,readonly", - "source=/sys/class/infiniband,target=/sys/class/infiniband,type=bind", - "source=/dev/hugepages,target=/dev/hugepages,type=bind", - "source=/usr/lib/x86_64-linux-gnu/libionic.so.1.0.54.0-164.g21c72dcad,target=/usr/lib/x86_64-linux-gnu/libionic.so.1.0.54.0-164.g21c72dcad,type=bind", - "source=/usr/lib/x86_64-linux-gnu/libionic.so.1,target=/usr/lib/x86_64-linux-gnu/libionic.so.1,type=bind", - "source=/usr/lib/x86_64-linux-gnu/libionic.so,target=/usr/lib/x86_64-linux-gnu/libionic.so,type=bind", - "source=/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so,target=/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so,type=bind", - "source=/etc/libibverbs.d/ionic.driver,target=/etc/libibverbs.d/ionic.driver,type=bind" - ], - - // Working directory inside container - "workspaceFolder": "/io", - "workspaceMount": "source=${localWorkspaceFolder},target=/io,type=bind", - - // 
Post-create command to install dependencies - "postCreateCommand": "apt-get update && apt-get install -y libibverbs-dev libgoogle-glog-dev && cd ep && ./install_deps.sh && python setup.py install", - - // Optional: VS Code / Cursor extensions to install in container - "customizations": { - "vscode": { - "extensions": [ - "ms-python.python", - "ms-python.vscode-pylance" - ] - } - } -} diff --git a/ep/lam/docs/data.md b/ep/lam/docs/data.md new file mode 100644 index 000000000..db856551f --- /dev/null +++ b/ep/lam/docs/data.md @@ -0,0 +1,21 @@ +# The original EFA performance + +[rank 0] Dispatch bandwidth: 2.23 GB/s, avg_t=3373.00 us | Combine bandwidth: 19.58 GB/s, avg_t=742.49 us +[rank 1] Dispatch bandwidth: 2.14 GB/s, avg_t=3509.00 us | Combine bandwidth: 23.94 GB/s, avg_t=607.13 us +[rank 2] Dispatch bandwidth: 2.26 GB/s, avg_t=3324.00 us | Combine bandwidth: 18.35 GB/s, avg_t=792.12 us +[rank 3] Dispatch bandwidth: 2.22 GB/s, avg_t=3381.00 us | Combine bandwidth: 19.78 GB/s, avg_t=734.92 us +[rank 4] Dispatch bandwidth: 2.17 GB/s, avg_t=3468.00 us | Combine bandwidth: 22.40 GB/s, avg_t=648.84 us +[rank 5] Dispatch bandwidth: 2.14 GB/s, avg_t=3517.00 us | Combine bandwidth: 24.33 GB/s, avg_t=597.45 us +[rank 6] Dispatch bandwidth: 2.10 GB/s, avg_t=3575.00 us | Combine bandwidth: 26.78 GB/s, avg_t=542.78 us +[rank 7] Dispatch bandwidth: 2.09 GB/s, avg_t=3590.00 us | Combine bandwidth: 27.53 GB/s, avg_t=528.00 us + +# Lam + +[rank 0] Dispatch bandwidth: 2.32 GB/s, avg_t=3233.00 us | Combine bandwidth: 17.23 GB/s, avg_t=843.55 us +[rank 1] Dispatch bandwidth: 2.19 GB/s, avg_t=3423.00 us | Combine bandwidth: 22.30 GB/s, avg_t=651.91 us +[rank 2] Dispatch bandwidth: 2.31 GB/s, avg_t=3246.00 us | Combine bandwidth: 17.51 GB/s, avg_t=830.24 us +[rank 3] Dispatch bandwidth: 2.22 GB/s, avg_t=3377.00 us | Combine bandwidth: 20.75 GB/s, avg_t=700.61 us +[rank 4] Dispatch bandwidth: 2.22 GB/s, avg_t=3380.00 us | Combine bandwidth: 20.81 GB/s, avg_t=698.59 us +[rank 5] 
Dispatch bandwidth: 2.26 GB/s, avg_t=3325.00 us | Combine bandwidth: 19.31 GB/s, avg_t=752.92 us +[rank 6] Dispatch bandwidth: 2.14 GB/s, avg_t=3509.00 us | Combine bandwidth: 25.64 GB/s, avg_t=567.05 us +[rank 7] Dispatch bandwidth: 2.12 GB/s, avg_t=3545.00 us | Combine bandwidth: 27.31 GB/s, avg_t=532.25 us \ No newline at end of file diff --git a/ep/lam/docs/dispatch_buffer_trace.md b/ep/lam/docs/dispatch_buffer_trace.md new file mode 100644 index 000000000..c60932e1f --- /dev/null +++ b/ep/lam/docs/dispatch_buffer_trace.md @@ -0,0 +1,150 @@ +# Trace: `x` and `rdma_channel_data.send_buffer` (high-throughput dispatch) + +## 1. Trace: `x` (original memory — read source) + +### With code lines + +| 層級 | 變數 / 來源 | File | Line | +|------|-------------|------|------| +| **Python** | 使用者傳入的 `x`(`torch.Tensor`,shape `[num_tokens, hidden]`) | `buffer.py` | 906-907 (參數 `x`) | +| **Python** | `x, x_scales = x if isinstance(x, tuple) else (x, None)` | `buffer.py` | 935 | +| **Python** | `self.runtime.internode_dispatch(x, x_scales, ...)` | `buffer.py` | 954, 1007 | +| **C++** | `Buffer::internode_dispatch(torch::Tensor const& x, ...)` | `uccl_ep.cc` | 401 (宣告), 416 (參數 `x`) | +| **C++** | 傳給 kernel 的 pointer:`x.data_ptr()` | `uccl_ep.cc` | 697 | +| **C++ (host)** | `dispatch(..., void const* x, ...)` | `internode.cu` | 1516 (參數 `x`) | +| **C++ (host)** | launch 時轉型:`reinterpret_cast(x)` | `internode.cu` | 1565 | +| **C++ (kernel)** | 參數 `int4 const* x` | `internode.cu` | 482 | +| **C++ (kernel)** | 讀取位址 `x + token_idx * hidden_int4` | `internode.cu` | 854 | + +### In kernel (device) + +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| Dispatch kernel param | `x` (type: `int4 const*`) | `internode.cu:482` | +| Copy source | `x + token_idx * hidden_int4` | `internode.cu:854` | + +### Host launch (C++) + +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| Kernel launch | 
`reinterpret_cast(x)` | `internode.cu:1565` | +| Host `dispatch()` param | `x` (type: `void const*`) | `internode.cu:1516` | +| Caller | `x.data_ptr()` | `uccl_ep.cc:697` | +| Caller method | `Buffer::internode_dispatch(torch::Tensor const& x, ...)` | `uccl_ep.cc:401, 416` | + +### Python + +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| `Buffer.internode_dispatch` arg | `x` (type: `Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]`) | `buffer.py:906-907` | +| Unpack if tuple | `x, x_scales = x if isinstance(x, tuple) else (x, None)` | `buffer.py:935` | +| Passed to C++ | `self.runtime.internode_dispatch(x, x_scales, ...)` | `buffer.py:954, 1007` | + +### Chain summary (x) + +``` +User / caller + → Python: x (torch.Tensor, shape [num_tokens, hidden]) + → buffer.py: internode_dispatch(self, x, ...) → self.runtime.internode_dispatch(x, ...) + → uccl_ep.cc: Buffer::internode_dispatch(torch::Tensor const& x, ...) → x.data_ptr() + → internode.cu: dispatch(..., void const* x, ...) → reinterpret_cast(x) + → kernel: int4 const* x → read at x + token_idx * hidden_int4 +``` + +So **`x`** is the **input activation tensor** from the MoE layer (user-provided), in GPU memory; the kernel reads from it. + +--- + +## 2. 
Trace: `rdma_channel_data.send_buffer` (new contiguous memory — write destination) + +### With code lines + +| 層級 | 變數 / 來源 | File | Line | +|------|-------------|------|------| +| **Python** | `num_rdma_bytes` 傳入 `Buffer(..., num_rdma_bytes=...)` | `buffer.py` | 59 (參數) | +| **Python** | `self.scratch = torch.zeros(num_rdma_bytes, dtype=torch.uint8, device=...)` | `buffer.py` | 92-94 | +| **Python** | (ROCm) `self.scratch = ep.get_rdma_buffer(num_rdma_bytes, device_index)` | `buffer.py` | 96 | +| **Python** | `rdma_buffer_ptr = self.scratch.data_ptr()` | `buffer.py` | 98 | +| **Python** | `self.runtime.set_rdma_buffer_raw(rdma_buffer_ptr)` | `buffer.py` | 128 | +| **C++** | Python binding 呼叫 `self.set_rdma_buffer_raw(addr)` → `Buffer::set_rdma_buffer_raw(void* ptr)` | `uccl_ep.cc` | 2059-2063 | +| **C++** | `void Buffer::set_rdma_buffer_raw(void* ptr)` → `rdma_buffer_ptr = ptr` | `uccl_ep.cc` | 1865-1869 | +| **C++** | Member:`void* Buffer::rdma_buffer_ptr` | `uccl_ep.cc` | 1912 | +| **C++** | `uccl::internode::dispatch(..., rdma_buffer_ptr, ...)` 傳入的參數 | `uccl_ep.cc` | 710 | +| **C++ (host)** | `void dispatch(..., void* rdma_buffer_ptr, ...)` | `internode.cu` | 1523 (參數) | +| **C++ (host)** | Kernel launch:`dispatch_func(..., rdma_buffer_ptr, ...)` | `internode.cu` | 1569-1570 | +| **C++ (kernel)** | 參數 `void* rdma_buffer_ptr` | `internode.cu` | 494 | +| **C++ (kernel)** | `auto rdma_channel_data = SymBuffer(rdma_buffer_ptr, ...)` | `internode.cu` | 552-554 | +| **C++ (kernel)** | RDMASender 用:`send_buffer` = `rdma_channel_data.send_buffer(lane_id)` 或 `.recv_buffer(lane_id)` | `internode.cu` | 725-727 | +| **C++ (kernel)** | Coordinator put 用:`rdma_channel_data.send_buffer(dst_rdma_rank)` | `internode.cu` | 1034 | +| **C++ (buffer.cuh)** | `SymBuffer::send_ptr` = `gbl_ptr + per_channel_bytes * sm_id` | `buffer.cuh` | 125 | +| **C++ (buffer.cuh)** | `dtype_t* send_buffer(int idx)` → `send_ptr + num_bytes * idx` | `buffer.cuh` | 132-135 | + +### In kernel (device) 
+ +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| Send buffer (per lane) | `send_buffer` = `rdma_channel_data.recv_buffer(lane_id)` or `.send_buffer(lane_id)` | `internode.cu:725-727` | +| Per-rank send buffer | `rdma_channel_data.send_buffer(dst_rdma_rank)` | `internode.cu:1034` | +| `rdma_channel_data` | `SymBuffer(rdma_buffer_ptr, num_max_rdma_chunked_recv_tokens * num_bytes_per_token, kNumRDMARanks, channel_id, num_channels)` | `internode.cu:552-554` | +| Kernel param | `rdma_buffer_ptr` (type: `void*`) | `internode.cu:494` | + +### SymBuffer: where `send_buffer` points (buffer.cuh) + +| Member | Meaning | +|--------|--------| +| `SymBuffer(void*& gbl_ptr, int num_elems, int num_ranks, int sm_id, int num_sms)` | Carves a region from `gbl_ptr`; **advances** `gbl_ptr` by `total_bytes`. | +| `send_ptr` | `gbl_ptr + per_channel_bytes * sm_id` (per-channel offset). | +| `send_buffer(int idx)` | `send_ptr + num_bytes * idx` → one contiguous region per rank index. | + +So **`rdma_channel_data.send_buffer(idx)`** is a **subregion of the memory pointed to by `rdma_buffer_ptr`** when the kernel was launched. 
+ +### Host launch (C++) + +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| Kernel launch | `rdma_buffer_ptr` passed to `dispatch(...)` | `internode.cu:1569` | +| Host `dispatch()` param | `void* rdma_buffer_ptr` | `internode.cu:1523` | +| Caller | `rdma_buffer_ptr` (no cast) | `uccl_ep.cc:710` | +| Source | `Buffer` member: `rdma_buffer_ptr` | `uccl_ep.cc:1912` | + +### Where `Buffer::rdma_buffer_ptr` is set + +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| Setter | `void Buffer::set_rdma_buffer_raw(void* ptr)` → `rdma_buffer_ptr = ptr` | `uccl_ep.cc:1865-1869` | +| Called from | Python binding / C++ init code that holds the actual allocation | `uccl_ep.cc:2060-2064` | + +### Python + +| Layer | Variable / Expression | File:Line | +|-------|----------------------|-----------| +| Allocation (CUDA) | `self.scratch = torch.zeros(num_rdma_bytes, dtype=torch.uint8, device=f"cuda:{device_index}")` | `buffer.py:92-94` | +| Allocation (ROCm) | `self.scratch = ep.get_rdma_buffer(num_rdma_bytes, device_index)` | `buffer.py:96` | +| Pointer | `rdma_buffer_ptr = self.scratch.data_ptr()` | `buffer.py:98` | +| Passed to C++ | `initialize_uccl(rdma_buffer_ptr, num_rdma_bytes, ...)` and later `self.runtime.set_rdma_buffer_raw(rdma_buffer_ptr)` | `buffer.py:100, 128` | +| `num_rdma_bytes` | Passed into `Buffer.__init__(..., num_rdma_bytes=...)`; typically from `config.get_rdma_buffer_size_hint(...)` at call sites | `buffer.py:59` | + +### Chain summary (rdma_channel_data.send_buffer) + +``` +User / config + → Python: num_rdma_bytes (e.g. from config.get_rdma_buffer_size_hint(hidden_bytes, num_ranks)) + → buffer.py: Buffer(..., num_rdma_bytes) → self.scratch = torch.zeros(num_rdma_bytes, ...) 
→ rdma_buffer_ptr = self.scratch.data_ptr() + → buffer.py: self.runtime.set_rdma_buffer_raw(rdma_buffer_ptr) + → uccl_ep.cc: Buffer::rdma_buffer_ptr (member) + → uccl_ep.cc: internode_dispatch(..., rdma_buffer_ptr, ...) → uccl::internode::dispatch(..., rdma_buffer_ptr, ...) + → internode.cu: dispatch(..., void* rdma_buffer_ptr, ...) → kernel param rdma_buffer_ptr + → kernel: rdma_channel_data = SymBuffer(rdma_buffer_ptr, ...) → send_ptr/send_buffer(idx) are offsets into that block + → rdma_channel_data.send_buffer(dst_rdma_rank) = base of the contiguous send region for that rank +``` + +So **`rdma_channel_data.send_buffer`** is the **contiguous send staging buffer** for each destination rank, carved out of the **RDMA buffer** that was allocated in Python (`self.scratch`) and set on the C++ `Buffer` via **`set_rdma_buffer_raw`**. + +--- + +## 3. Quick reference + +| What | Origin | +|------|--------| +| **x** | User’s input tensor (Python `x` → `x.data_ptr()` → kernel `x`). | +| **rdma_channel_data.send_buffer** | A region inside the RDMA buffer whose base pointer is set by Python (`scratch.data_ptr()` → `set_rdma_buffer_raw`) and passed into the kernel as `rdma_buffer_ptr`; layout is built by `SymBuffer` in the kernel. | diff --git a/ep/lam/docs/lam_understanding.md b/ep/lam/docs/lam_understanding.md new file mode 100644 index 000000000..98dc37cd4 --- /dev/null +++ b/ep/lam/docs/lam_understanding.md @@ -0,0 +1,497 @@ +# rdma_recv_x Layout and Usage + +`rdma_recv_x` is the receive data buffer used in the low-latency dispatch phase (`ep/src/internode_ll.cu`). Each rank has its **own** buffer in its **own** GPU memory; when rank A sends to an expert on rank B, A does an RDMA put **into B’s** `rdma_recv_x`. + +--- + +## Dispatch send flow (sender side) + +- **Input** goes into **`rdma_x`** (send buffer): each token’s hidden is read from input `x`, cast/copied into `rdma_x[token_idx]` (one message per token, `num_bytes_per_msg` each). 
+- **Multiple SMs**: the grid runs many blocks (SMs). Tokens are split across SMs with: + ```cpp + for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) + ``` + So each SM handles every `num_sms`-th token (e.g. SM 0 → token 0, num_sms, 2*num_sms, …). +- **Per token, one warp per top-k**: for each token, `num_topk` warps (warp_id 0 .. num_topk-1) each take one top-k destination. Each warp reads `dst_expert_idx = topk_idx[token_idx][warp_id]`, allocates a `slot_idx` for that expert, then puts one message from `rdma_x[token_idx]` to the receiver’s `rdma_recv_x` at the slot given by the layout below. +- **Write to dst**: the remote address is computed with the **rdma_recv_x layout** (see next section): `dst_ptr = rdma_recv_x + dst_expert_local_idx * ... + rank * ... + slot_idx * num_bytes_per_msg`, and the sender does an RDMA put (or IPC copy) of that one message to `dst_ptr`. + +--- + +## Count send (after token sends) + +After all token puts to each expert are done, the sender issues a **count send** for that expert. + +- **Not a broadcast to all ranks**: one count is sent **per expert**, to the **rank that owns that expert** (`dst_rank = expert_idx / num_local_experts`). So we send one count to the owner of expert 0, one to the owner of expert 1, etc. Each rank only receives counts for its own local experts (from each source rank). +- **Content**: “I (this rank) sent you (dst_rank) this many tokens for expert `dst_expert_local_idx`” — i.e. the number of tokens this rank routed to that expert (`num_tokens_sent`), encoded as `-num_tokens_sent - 1` and written into the receiver’s `rdma_recv_count[dst_expert_local_idx][rank]` (or `rdma_recv_count_internode` for cross-node). The receiver polls until non-zero, then decodes `num_tokens = -value - 1` and uses that to know how many messages to read from `rdma_recv_x` for that (expert, source_rank). 
+ +--- + +## Layout + +**Logical 3D layout** (on the rank that owns the buffer): + +``` +rdma_recv_x[expert_local_idx][source_rank][slot_idx] +``` + +| Dimension | Index | Size | Meaning | +|-----------|--------|------|---------| +| 0 | `expert_local_idx` | `num_local_experts` | Which local expert on this rank | +| 1 | `source_rank` | `num_ranks` | Which rank sent the data | +| 2 | `slot_idx` | `num_max_dispatch_tokens_per_rank` | Which message slot from that source | + +- **One slot** = `num_bytes_per_msg` bytes (control `int4` + hidden + scales). +- **Total size per GPU:** `num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg`. + +Declared in `ep/include/ep_config.hpp` as `LowLatencyBuffer::dispatch_rdma_recv_data_buffer`; allocated from `rdma_buffer` in `LowLatencyLayout`, passed to the kernel as `rdma_recv_x`. + +--- + +## Usage + +**Sender (writing):** The sender computes the **remote** address on the receiver’s `rdma_recv_x` with: + +```cpp +dst_ptr = rdma_recv_x + + dst_expert_local_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + slot_idx * num_bytes_per_msg; +``` + +So the sender uses `(dst_expert_local_idx, rank, slot_idx)` to pick one slot in the receiver’s buffer and does an RDMA put (or IPC copy) of one message there. + +**Receiver (reading):** The receiver waits for the send count for each `(local_expert_idx, src_rank)`, then reads that many messages from the corresponding block: + +```cpp +rdma_recv_x_uint8 = rdma_recv_x + + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + src_rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; +// then read slots 0 .. num_recv_tokens - 1 (each num_bytes_per_msg) +``` + +So the receiver uses the same layout: for each of its local experts and each source rank, it reads `num_recv_tokens` contiguous slots from that block and packs them into `packed_recv_x` etc. 
+ +--- + +## Dispatch Receive Flow (receiver side) + +### Overview + +The receiver side of dispatch: +1. **Polls** for incoming tokens from each `(local_expert, src_rank)` pair +2. **Copies** tokens from `rdma_recv_x` (RDMA buffer, may have gaps) to `packed_recv_x` (contiguous output buffer) +3. Records metadata for the combine phase + +### Input / Output + +| Buffer | Type | Description | +|--------|------|-------------| +| **Input:** `rdma_recv_x` | RDMA recv buffer | Organized by `[local_expert][src_rank][slot]`, may have unused slots | +| **Input:** `rdma_recv_count` | Atomic counters | Token counts per `(local_expert, src_rank)`, set by senders | +| **Output:** `packed_recv_x` | Packed buffer | Contiguous tokens per expert, ready for Expert Forward | +| **Output:** `packed_recv_src_info` | int array | Original token index at source rank (for combine phase) | +| **Output:** `packed_recv_count` | int array | Total tokens received per local expert | +| **Output:** `packed_recv_layout_range` | int64 array | `(num_tokens, begin_idx)` per `(local_expert, src_rank)` | + +### Parallelization Hierarchy + +``` +Block (SM) +└── Warp Group (handles one (local_expert, src_rank) pair) + └── Sub-warp (= 1 warp = 32 threads, handles multiple tokens in strided fashion) + └── Lane (= 1 thread, handles part of one token's hidden data) +``` + +> **Note:** "Sub-warp" and "warp" are the **same thing** (32 threads). The naming depends on perspective: +> - From **block** perspective: called "warp" (`warp_id = thread_id / 32`) +> - From **warp group** perspective: called "sub-warp" (`sub_warp_id = warp_id % num_warps_per_group`) +> +> A warp group contains multiple warps; each warp within the group is called a "sub-warp" to indicate it's a sub-unit of the group. 
+ +### ASCII Diagram: Block Structure + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Block 0 (SM 0) │ +│ blockDim.x = 1024 threads │ +│ │ +│ ┌─────────────────────────────────────┐ ┌─────────────────────────────────┐│ +│ │ Warp Group 0 │ │ Warp Group 1 ││ +│ │ (handles Expert Pair 0) │ │ (handles Expert Pair 1) ││ +│ │ num_warps_per_group = 4 │ │ num_warps_per_group = 4 ││ +│ │ │ │ ││ +│ │ ┌─────────────────────────────┐ │ │ ┌─────────────────────────────┐││ +│ │ │ Sub-warp 0 (warp_id=0) │ │ │ │ Sub-warp 0 (warp_id=4) │││ +│ │ │ ┌──┬──┬──┬──┬─────┬──┬──┐ │ │ │ │ 32 threads │││ +│ │ │ │T0│T1│T2│T3│ ... │T30│T31│ │ │ │ └─────────────────────────────┘││ +│ │ │ └──┴──┴──┴──┴─────┴──┴──┘ │ │ │ ┌─────────────────────────────┐││ +│ │ │ 32 threads (lanes) │ │ │ │ Sub-warp 1 (warp_id=5) │││ +│ │ └─────────────────────────────┘ │ │ │ 32 threads (waiter+copier) │││ +│ │ ┌─────────────────────────────┐ │ │ └─────────────────────────────┘││ +│ │ │ Sub-warp 1 (warp_id=1) │ │ │ ┌─────────────────────────────┐││ +│ │ │ 32 threads │ │ │ │ Sub-warp 2 (warp_id=6) │││ +│ │ │ ★ waiter + copy tokens │ │ │ │ 32 threads │││ +│ │ └─────────────────────────────┘ │ │ └─────────────────────────────┘││ +│ │ ┌─────────────────────────────┐ │ │ ┌─────────────────────────────┐││ +│ │ │ Sub-warp 2 (warp_id=2) │ │ │ │ Sub-warp 3 (warp_id=7) │││ +│ │ │ 32 threads │ │ │ │ 32 threads │││ +│ │ └─────────────────────────────┘ │ │ └─────────────────────────────┘││ +│ │ ┌─────────────────────────────┐ │ │ ││ +│ │ │ Sub-warp 3 (warp_id=3) │ │ │ ││ +│ │ │ 32 threads │ │ │ ││ +│ │ └─────────────────────────────┘ │ │ ││ +│ └─────────────────────────────────────┘ └─────────────────────────────────┘│ +└─────────────────────────────────────────────────────────────────────────────┘ + +Index calculations: +┌────────────────────────────────────────────────────────────────┐ +│ thread_id = threadIdx.x (0 ~ 1023) │ +│ warp_id = thread_id / 32 (0 ~ 31) │ +│ lane_id = 
thread_id % 32 (0 ~ 31) │ +│ │ +│ warp_group_id = warp_id / num_warps_per_group (which group) │ +│ sub_warp_id = warp_id % num_warps_per_group (which in group) │ +│ │ +│ responsible_expert_idx = sm_id * num_warp_groups + warp_group_id│ +└────────────────────────────────────────────────────────────────┘ +``` + +### ASCII Diagram: (local_expert, src_rank) Distribution + +Each `responsible_expert_idx` maps to one `(local_expert, src_rank)` pair: + +``` +responsible_expert_idx ∈ [0, num_experts - 1] + +src_rank = responsible_expert_idx / num_local_experts +local_expert_idx = responsible_expert_idx % num_local_experts + +Example: num_local_experts = 9, num_ranks = 32, num_experts = 288 + +┌───────────────────────────────────────────────────────────────┐ +│ idx │ src_rank │ local_expert │ Meaning │ +├─────┼──────────┼──────────────┼───────────────────────────────┤ +│ 0 │ 0 │ 0 │ Recv from Rank0 for Expert0 │ +│ 1 │ 0 │ 1 │ Recv from Rank0 for Expert1 │ +│ ... │ ... │ ... │ │ +│ 8 │ 0 │ 8 │ Recv from Rank0 for Expert8 │ +│ 9 │ 1 │ 0 │ Recv from Rank1 for Expert0 │ +│ 10 │ 1 │ 1 │ Recv from Rank1 for Expert1 │ +│ ... │ ... │ ... │ │ +│ 287 │ 31 │ 8 │ Recv from Rank31 for Expert8 │ +└───────────────────────────────────────────────────────────────┘ + +Total 288 pairs = 288 warp groups needed +``` + +### ASCII Diagram: Receive Matrix (on one rank) + +``` +Rank 0 as Receiver, has 9 local experts (Expert 0~8) +Needs to receive from 32 src_ranks (including itself) + +┌─────────────────────────────────────────────────────────────────────────┐ +│ Rank 0's Receive Matrix │ +│ │ +│ src_rank │ +│ 0 1 2 3 ... 31 │ +│ ┌────┬────┬────┬────┬─────┬────┐ │ +│ 0 │ WG │ WG │ WG │ WG │ ... │ WG │ ← recv from each rank for Exp0 │ +│ ├────┼────┼────┼────┼─────┼────┤ │ +│ 1 │ WG │ WG │ WG │ WG │ ... │ WG │ ← recv from each rank for Exp1 │ +│ l ├────┼────┼────┼────┼─────┼────┤ │ +│ o 2 │ WG │ WG │ WG │ WG │ ... │ WG │ │ +│ c ├────┼────┼────┼────┼─────┼────┤ │ +│ a 3 │ WG │ WG │ WG │ WG │ ... 
│ WG │ │ +│ l ├────┼────┼────┼────┼─────┼────┤ │ +│ 4 │ WG │ WG │ WG │ WG │ ... │ WG │ │ +│ e ├────┼────┼────┼────┼─────┼────┤ │ +│ x 5 │ WG │ WG │ WG │ WG │ ... │ WG │ │ +│ p ├────┼────┼────┼────┼─────┼────┤ │ +│ e 6 │ WG │ WG │ WG │ WG │ ... │ WG │ │ +│ r ├────┼────┼────┼────┼─────┼────┤ │ +│ t 7 │ WG │ WG │ WG │ WG │ ... │ WG │ │ +│ ├────┼────┼────┼────┼─────┼────┤ │ +│ 8 │ WG │ WG │ WG │ WG │ ... │ WG │ │ +│ └────┴────┴────┴────┴─────┴────┘ │ +│ │ +│ Each cell = 1 Warp Group = 1 (local_expert, src_rank) pair │ +│ Total: 9 × 32 = 288 Warp Groups │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### ASCII Diagram: Sub-warp Token Distribution + +Within a warp group, sub-warps divide tokens by index: + +``` +Warp Group for (local_expert=2, src_rank=5) receives 30 tokens +num_warps_per_group = 10 + +Token distribution (for loop: i = sub_warp_id; i < num_recv_tokens; i += 10): +┌─────────────────────────────────────────────────────────────────────┐ +│ │ +│ Sub-warp 0: processes token 0, 10, 20 │ +│ Sub-warp 1: processes token 1, 11, 21 (also polls for arrival) │ +│ Sub-warp 2: processes token 2, 12, 22 │ +│ Sub-warp 3: processes token 3, 13, 23 │ +│ Sub-warp 4: processes token 4, 14, 24 │ +│ Sub-warp 5: processes token 5, 15, 25 │ +│ Sub-warp 6: processes token 6, 16, 26 │ +│ Sub-warp 7: processes token 7, 17, 27 │ +│ Sub-warp 8: processes token 8, 18, 28 │ +│ Sub-warp 9: processes token 9, 19, 29 │ +│ │ +│ All 10 sub-warps process 10 different tokens in PARALLEL! 
│ +│ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### ASCII Diagram: Single Token Copy (within one sub-warp) + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Sub-warp processes 1 Token │ +│ │ +│ Token structure: [src_info: 16B] [hidden: 7168B] [scales: ~56B] │ +│ │ +│ 32 threads copy in parallel: │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Lane 0: src_info + hidden[0:16] │ │ +│ │ Lane 1: hidden[16:32] │ │ +│ │ Lane 2: hidden[32:48] │ │ +│ │ ... │ │ +│ │ Lane 31: hidden[496:512] │ │ +│ │ │ │ +│ │ (then loop: Lane 0 copies hidden[512:528], ...) │ │ +│ │ │ │ +│ │ UNROLLED_WARP_COPY: 32 lanes copy in parallel │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ +│ 7168B / 32 threads / 16B per load ≈ 14 iterations │ +│ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Understanding token_idx: Local Index on Sender + +**IMPORTANT:** `token_idx` in the recv logs is the **sender's local token index**, NOT a global index. + +``` +Each rank has its OWN set of input tokens (e.g., 128 tokens per rank): + +┌─────────────────────────────────────────────────────────────────────┐ +│ Rank 0: token 0, 1, 2, ..., 83, ..., 127 (local to Rank 0) │ +│ Rank 1: token 0, 1, 2, ..., 83, ..., 127 (local to Rank 1) │ +│ Rank 2: token 0, 1, 2, ..., 83, ..., 127 (local to Rank 2) │ +│ ... │ +│ Rank 8: token 0, 1, 2, ..., 83, ..., 127 (local to Rank 8) │ +│ │ +│ These are DIFFERENT tokens with the same local index! │ +└─────────────────────────────────────────────────────────────────────┘ + +Example log interpretation: + [RECV] rank=0 expert=156 src_rank=8 slot=3 token_idx=83 ... 
+ + This means: + - Rank 0 received a token + - The token came from Rank 8 + - On Rank 8, this token was token #83 (out of Rank 8's 128 tokens) + - token_idx=83 is Rank 8's LOCAL index, not global +``` + +**Why multiple logs show the same token_idx (e.g., token_idx=83)?** + +``` +If you see 11 recv logs all with token_idx=83, they are 11 DIFFERENT tokens: + - Rank 0's token #83, Rank 2's token #83, Rank 6's token #83, etc. + - They happen to share the same local index on their respective ranks + - They are routed to different experts on Rank 0 based on top-k routing + +Example (top-k=2, 128 tokens per rank): +┌─────────────────────────────────────────────────────────────────────┐ +│ Rank 6's token #83: │ +│ → top-k routing sends to Expert 3 (on Rank 0) → expert=111 │ +│ → top-k routing sends to Expert 10 (on Rank 0) → expert=118 │ +│ │ +│ Rank 8's token #83: │ +│ → top-k routing sends to Expert 5 (on Rank 0) → expert=149 │ +│ → top-k routing sends to Expert 12 (on Rank 0) → expert=156 │ +│ │ +│ These are 4 different recv operations for 2 different tokens │ +│ (each token sent to 2 experts due to top-k=2) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Complete Data Flow Example + +``` +Rank 5 (Sender) has 128 local tokens (token 0 ~ 127): +┌─────────────────────────────────────────────────────────────────────┐ +│ Input tokens: T0, T1, T2, T3, T4, T5, ..., T83, ..., T127 │ +│ (these are Rank 5's LOCAL tokens) │ +│ │ +│ Top-k routing results (top-k=2, each token → 2 experts): │ +│ T0 → Expert 2 (on Rank 0), Expert 15 (on Rank 1) │ +│ T1 → Expert 7 (on Rank 0), Expert 22 (on Rank 2) │ +│ T2 → Expert 2 (on Rank 0), Expert 31 (on Rank 3) │ +│ T83 → Expert 4 (on Rank 0), Expert 19 (on Rank 1) │ +│ ... │ +│ │ +│ Sent to Rank 0: T0, T1, T2, T83, ... 
(with their local indices) │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ +Rank 0 (Receiver): +┌─────────────────────────────────────────────────────────────────────┐ +│ Warp Group for (local_expert=2, src_rank=5) │ +│ │ +│ Received tokens from Rank 5 for Expert 2: │ +│ slot[0] → token_idx=0 (Rank 5's T0) │ +│ slot[1] → token_idx=2 (Rank 5's T2) │ +│ ... │ +│ │ +│ token_idx is preserved so combine phase can send results back! │ +│ │ +│ Sub-warps divide work (assuming 10 sub-warps): │ +│ Sub-warp 0: slot[0], slot[10], ... │ +│ Sub-warp 1: slot[1], slot[11], ... │ +│ ... │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +**Key formulas:** +```cpp +warp_group_id = warp_id / num_warps_per_group; +sub_warp_id = warp_id % num_warps_per_group; +responsible_expert_idx = sm_id * num_warp_groups + warp_group_id; +src_rank = responsible_expert_idx / num_local_experts; +local_expert_idx = responsible_expert_idx % num_local_experts; +``` + +**Configuration (computed at launch):** +```cpp +num_warp_groups = ceil_div(num_experts, num_device_sms); // warp groups per block +num_warps_per_group = kNumMaxWarpGroups / num_warp_groups; // sub-warps per warp group +num_sms = ceil_div(num_experts, num_warp_groups); // blocks to launch +// kNumMaxWarpGroups = 32 (NVIDIA) or 16 (AMD) +``` + +**Example (256 experts, H100 with 132 SMs):** +- `num_warp_groups = ceil(256/132) = 2` warp groups per block +- `num_warps_per_group = 32/2 = 16` sub-warps per warp group +- `num_sms = ceil(256/2) = 128` blocks launched +- Total: 128 blocks × 2 warp groups = 256 responsible_expert_idx values + +### What Each Level Does + +| Level | Handles | Description | +|-------|---------|-------------| +| **Warp Group** | One `(local_expert, src_rank)` pair | Polls for tokens, then copies all tokens from that src_rank to that expert | +| **Sub-warp** | Multiple tokens (strided) | Each sub-warp copies tokens `i, i+n, i+2n, ...` where `n = 
num_warps_per_group` | +| **Lane** | Part of one token's hidden | 32 lanes cooperatively copy one token's ~7KB hidden data | + +### Key Insight: Warp Group = (local_expert, src_rank) + +**Each warp group processes exactly one `(local_expert, src_rank)` pair:** + +1. **One warp group** is assigned one `responsible_expert_idx` which maps to one `(local_expert, src_rank)` pair +2. **Many tokens** may arrive from this `src_rank` to this `local_expert` (stored in `num_recv_tokens`) +3. **All sub-warps in this warp group** cooperatively process these tokens in parallel + +``` +Code verification (internode_ll.cu): + +Line 527-529: Map responsible_expert_idx → (local_expert, src_rank) +┌─────────────────────────────────────────────────────────────────────┐ +│ if (responsible_expert_idx < num_experts) { │ +│ src_rank = responsible_expert_idx / num_local_experts; │ +│ local_expert_idx = responsible_expert_idx % num_local_experts; │ +│ } │ +└─────────────────────────────────────────────────────────────────────┘ + +Line 620: Count tokens from this (local_expert, src_rank) pair +┌─────────────────────────────────────────────────────────────────────┐ +│ num_recv_tokens = num_recv_tokens_internode + num_recv_tokens_ipc; │ +│ // This is the total tokens from src_rank to local_expert │ +└─────────────────────────────────────────────────────────────────────┘ + +Line 656: All sub-warps in warp group process these tokens together +┌─────────────────────────────────────────────────────────────────────┐ +│ for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) │ +│ // sub_warp 0: token 0, 10, 20, ... │ +│ // sub_warp 1: token 1, 11, 21, ... │ +│ // ... 
│ +│ } │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +**Summary:** +- **Warp Group scope:** one `(local_expert, src_rank)` pair +- **What it receives:** all tokens that `src_rank` sent to `local_expert` +- **How it processes:** all sub-warps (warps) in the group work together, dividing tokens by index + +### recv_token_begin_idx: Dynamic Slot Allocation + +Multiple warp groups (different `src_rank`) write to the same expert's output buffer. They use **atomicAdd** for thread-safe slot allocation: + +```cpp +// Line 444-445 in internode_ll.cu +recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens); +``` + +**How it works:** +1. `packed_recv_count[local_expert_idx]` is an atomic counter, initialized to 0 +2. `atomicAdd` returns the value **before** adding, and atomically increments the counter +3. The returned value is the starting index for this warp group's tokens + +**Example (Expert 0 receives from 4 src_ranks, in arbitrary order):** + +| Order | Warp Group | src_rank | num_tokens | atomicAdd returns | counter after | Write slots | +|-------|------------|----------|------------|-------------------|---------------|-------------| +| 1st | B | 2 | 8 | 0 | 8 | [0..7] | +| 2nd | A | 0 | 5 | 8 | 13 | [8..12] | +| 3rd | D | 5 | 3 | 13 | 16 | [13..15] | +| 4th | C | 1 | 4 | 16 | 20 | [16..19] | + +**Key points:** +- **No ordering required**: First-come, first-served. Order depends on network latency and GPU scheduling. +- **Correctness**: `recv_src_info[i]` records the original token index, so combine phase can send results back correctly. +- **recv_range**: `recv_range[src_rank] = pack2(num_recv_tokens, recv_token_begin_idx)` records where each src_rank's tokens ended up. + +### Receive Flow Detail + +``` +1. Grid Sync + └── Wait for send phase to complete (makes packed_recv_count visible) + +2. 
Compute buffer pointers + └── Each warp group calculates its slice of rdma_recv_x based on (local_expert, src_rank) + +3. Poll for token count (sub_warp_id == 1, lane_id == 0 only) + └── Spin on rdma_recv_count[local_expert][src_rank] until non-zero + └── Decode: num_recv_tokens = -value - 1 + +4. Allocate output slots (sub_warp_id == 1, lane_id == 0 only) + └── recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens) + └── Store to shared memory for other sub-warps + +5. Warp group barrier + └── All sub-warps now have num_recv_tokens and recv_token_begin_idx + +6. Copy tokens (all sub-warps in parallel) + └── for (i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group): + ├── Lane 0: copy src_info (4 bytes) + ├── All 32 lanes: copy hidden data (~7KB, strided by lane_id) + └── All 32 lanes: copy FP8 scales (if applicable) +``` + +### Why Pack? + +| rdma_recv_x (Source) | packed_recv_x (Destination) | +|----------------------|-----------------------------| +| Organized by `[expert][src_rank][slot]` | Organized by `[expert][token]` | +| Fixed slot size, may have unused slots | Contiguous, no gaps | +| Scattered across src_ranks | All tokens for one expert together | +| Not suitable for batched compute | Ready for Expert Forward (batched MLP) | diff --git a/ep/lam/setup_scripts/setup_lam_conda_local.sh b/ep/lam/setup_scripts/setup_lam_conda_local.sh new file mode 100755 index 000000000..3760a19fe --- /dev/null +++ b/ep/lam/setup_scripts/setup_lam_conda_local.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +LAM_DIR="/home/ubuntu/lam" +ENV_PATH="${LAM_DIR}/uccl_lam_local" + +echo "Creating ${LAM_DIR} if it doesn't exist..." +mkdir -p "$LAM_DIR" + +echo "Creating conda env at ${ENV_PATH} with Python 3.10..." +conda create -p "$ENV_PATH" python=3.10 -y + +echo "Activating environment and starting shell..." 
+# shellcheck source=/dev/null +source "$(conda info --base)/etc/profile.d/conda.sh" +conda activate "$ENV_PATH" +exec "$SHELL" From 67af2d450a8ca8fcf4e21e5c57000bcf2afeff23 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:45 -0500 Subject: [PATCH 08/15] the main batching logic --- ep/include/ep_config.hpp | 10 +- ep/include/uccl_ibgda.cuh | 10 +- ep/src/internode_ll.cu | 211 +++++++++++++++++++++++++++++++++++++- ep/src/rdma.cpp | 9 +- 4 files changed, 231 insertions(+), 9 deletions(-) diff --git a/ep/include/ep_config.hpp b/ep/include/ep_config.hpp index 92c41fb7f..082fcdf29 100644 --- a/ep/include/ep_config.hpp +++ b/ep/include/ep_config.hpp @@ -197,9 +197,15 @@ struct LowLatencyLayout { // Send buffer #ifdef LAM_DEV - // Lam: Change the layout to [slot][dst_expert] to enable batched RDMA sends + // Lam: Buffer layout for batched RDMA sends: + // ┌─────────────────────────────────┬──────────────────────────────────────────────┐ + // │ Temp buffer (offset 0) │ RDMA batch buffer (offset num_max_tokens) │ + // │ rdma_x[token_idx] │ rdma_x[num_max_tokens + expert*max + slot] │ + // │ Size: num_max_tokens * msg_size │ Size: num_experts * num_max_tokens * msg_size│ + // └─────────────────────────────────┴──────────────────────────────────────────────┘ + // Flow: FP8 cast -> temp buffer -> copy to rdma_batch_buffer -> batch RDMA send size_t dispatch_send_buffer_bytes = - num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; + (num_experts + 1) * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; #else size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; diff --git a/ep/include/uccl_ibgda.cuh b/ep/include/uccl_ibgda.cuh index 41b48d1cd..753a5e63f 100644 --- a/ep/include/uccl_ibgda.cuh +++ b/ep/include/uccl_ibgda.cuh @@ -29,7 +29,7 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( int expert_idx, int lane_id, int message_idx, uint64_t const* 
d2h_channel_addrs, int num_d2h_channel_addrs, bool is_combine, int low_latency_buffer_idx = 0, uint64_t atomic_offset = 0, - uint64_t atomic_val = 0) { + uint64_t atomic_val = 0, int num_tokens = 1) { // NOTE(MaoZiming): different from the nvshmemi_ibgda_put_nbi_warp in // ibgda_device.cuh, we don't do warp-cooperation. if (lane_id != 0) return; @@ -67,6 +67,10 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( cmd.atomic_val = atomic_val; } else { cmd.expert_idx = expert_idx; + // Low-latency WRITE: use atomic_val byte for num_tokens (1..255). + cmd.atomic_val = (num_tokens <= 0 || num_tokens > 255) + ? 1 + : static_cast(num_tokens); } h->atomic_set_and_commit(cmd, &slot); } @@ -115,6 +119,10 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( cmd.atomic_val = atomic_val; } else { cmd.expert_idx = expert_idx; + // Low-latency WRITE: use atomic_val byte for num_tokens (1..255). + cmd.atomic_val = (num_tokens <= 0 || num_tokens > 255) + ? 1 + : static_cast(num_tokens); } h->atomic_set_and_commit(cmd, &slot); break; diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index d08898456..940b0d8b8 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -75,11 +75,11 @@ __global__ __launch_bounds__(1024, 1) void dispatch( void* atomic_buffer_ptr = nullptr, int64_t* rdma_recv_count_internode = nullptr, int* grid_sync_barrier_ptr = nullptr) { -#ifdef LAM_DEV - if (blockIdx.x == 0 && threadIdx.x == 0) { - // printf("[LAM_DEV] dispatch called\n"); - } -#endif +// #ifdef LAM_DEV + // if (blockIdx.x == 0 && threadIdx.x == 0) { + // printf("[LAM_DEV] dispatch called\n"); + // } +// #endif auto const sm_id = static_cast(blockIdx.x); auto const thread_id = static_cast(threadIdx.x); auto const warp_id = thread_id / WARP_SIZE, lane_id = get_lane_id(); @@ -248,6 +248,11 @@ __global__ __launch_bounds__(1024, 1) void dispatch( } sync_barrier_1(num_threads); + // if (warp_id < num_topk && lane_id == 0 && dst_expert_idx < 0) { + // 
printf("[SKIP] rank=%d token=%d topk_slot=%d expert=-1\n", + // rank, token_idx, warp_id); + // } + // Issue IBGDA sends if (dst_expert_idx >= 0) { int slot_idx = @@ -270,6 +275,53 @@ __global__ __launch_bounds__(1024, 1) void dispatch( dst_rank, max_nvl_peers, 0) : 0; if (dst_p2p_ptr == 0) { + // if (lane_id == 0) printf("rank=%d dst_rank=%d dst_expert_idx=%d num_local_experts=%d dst_expert_local_idx=%d\n", rank, dst_rank, dst_expert_idx, num_local_experts, dst_expert_local_idx); +#ifdef LAM_DEV + // Lam: IBGDA -> copy temp to rdma_batch_buffer, batch send later + auto const lam_slot = shared_send_slots[warp_id]; + auto const batch_buf_offset = num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + auto const batch_buf_ptr = static_cast(rdma_x) + batch_buf_offset + + (dst_expert_idx * num_max_dispatch_tokens_per_rank + lam_slot) * num_bytes_per_msg; + auto const* src_int4_ptr = reinterpret_cast(rdma_x_src_idx); + auto* batch_buf_int4_ptr = reinterpret_cast(batch_buf_ptr); + UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, batch_buf_int4_ptr, + src_int4_ptr, ld_nc_global, st_na_global); + // Debug: print copy info (payload[0..3] matches RECV format) + // if (lane_id == 0) { + // auto const* payload = + // reinterpret_cast( + // reinterpret_cast(rdma_x_src_idx) + sizeof(int4)); + // printf("[IBGDA] rank=%d token=%d expert=%d slot=%d [IBGDA COPY LAM] " + // "from=%p to=%p payload[0..3]=%08x %08x %08x %08x\n", + // rank, token_idx, dst_expert_idx, lam_slot, + // (void*)rdma_x_src_idx, (void*)batch_buf_ptr, + // payload[0], payload[1], payload[2], payload[3]); + // } +#else + // Debug: print send info (payload[0..3] matches RECV format) + // if (lane_id == 0) { + // auto const* payload = + // reinterpret_cast( + // reinterpret_cast(src_ptr) + sizeof(int4)); + // printf("[IBGDA] rank=%d token=%d expert=%d slot=%d [IBGDA SEND] " + // "from=%p to=0x%lx dst_rank=%d payload[0..3]=%08x %08x %08x %08x\n", + // rank, token_idx, dst_expert_idx, slot_idx, + // (void*)src_ptr, 
dst_ptr, dst_rank, + // payload[0], payload[1], payload[2], payload[3]); + // } + // if (lane_id == 0) { + // printf( + // "[IBGDA PARAMS] rank=%d " + // "num_bytes_per_msg=%lu dst_rank=%d dst_expert_local_idx=%d " + // "lane_id=%d slot_idx=%d d2h_channel_addrs=%p num_d2h_channel_addrs=%d " + // "low_latency_buffer_idx=%d\n", + // rank, + // (unsigned long)num_bytes_per_msg, dst_rank, + // dst_expert_local_idx, lane_id, slot_idx, d2h_channel_addrs, + // num_d2h_channel_addrs, + // low_latency_buffer_idx + // ); + // } __threadfence_system(); uccl::nvshmemi_ibgda_put_nbi_warp( dst_ptr - reinterpret_cast(rdma_buffer_ptr), @@ -279,12 +331,24 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // rb. lane_id, slot_idx, d2h_channel_addrs, num_d2h_channel_addrs, false, low_latency_buffer_idx); +#endif } else { // Intra-node: use direct memory copy via IPC auto const* src_int4_ptr = reinterpret_cast(src_ptr); auto* dst_int4_ptr = reinterpret_cast(dst_p2p_ptr); UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global); + // Debug: print IPC copy info (payload data[0..3] matches RECV format) + // if (lane_id == 0) { + // auto const* payload = + // reinterpret_cast( + // reinterpret_cast(src_ptr) + sizeof(int4)); + // printf("[IPC] rank=%d token=%d expert=%d slot=%d [IPC COPY] " + // "from=%p to=%p dst_rank=%d payload[0..3]=%08x %08x %08x %08x\n", + // rank, token_idx, dst_expert_idx, slot_idx, + // (void*)src_ptr, (void*)dst_p2p_ptr, dst_rank, + // payload[0], payload[1], payload[2], payload[3]); + // } } // Increase counter after finishing __syncwarp(); @@ -337,6 +401,114 @@ __global__ __launch_bounds__(1024, 1) void dispatch( } } __syncthreads(); + +#ifdef LAM_DEV + // Lam: Grid-wide sync before batch send phase. + // __syncthreads() only syncs within a single thread block (SM). + // The token loop distributes tokens round-robin across SMs + // (token_idx = sm_id, stepping by num_sms). 
When num_tokens < num_sms, + // most SMs skip the token loop and pass __syncthreads() immediately, + // while the SMs processing tokens are still writing to the batch buffer + // and incrementing atomic_send_counter_per_expert. + // Without grid sync, the batch send phase can read a stale/partial + // counter and send fewer tokens than actually produced, causing the + // receiver to hang waiting for data that never arrives. +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + amd::grid_sync(grid_sync_barrier_ptr, num_sms); +#else + cg::this_grid().sync(); +#endif + + // Lam: Batch RDMA send phase - send entire expert buffer in ONE IBGDA call + // Each warp group handles one expert (only first sub_warp does the send) + if (responsible_expert_idx < num_experts && sub_warp_id == 0) { + auto const dst_rank = responsible_expert_idx / num_local_experts; + auto const dst_expert_local_idx = responsible_expert_idx % num_local_experts; + + // Check if this destination is inter-node (needs IBGDA batch send) + // IPC destinations were already sent in the token loop + auto const test_dst_ptr = reinterpret_cast(rdma_recv_x); + auto const dst_p2p_ptr = + ipc_rdma_base_ptrs + ? 
uccl::get_ipc_p2p_ptr(test_dst_ptr, ipc_rdma_base_ptrs, rank, + dst_rank, max_nvl_peers, 0) + : 0; + + if (dst_p2p_ptr == 0) { + // Inter-node: batch send ALL tokens for this expert in ONE call + auto const num_tokens_to_send = + atomic_send_counter_per_expert[responsible_expert_idx]; + + if (num_tokens_to_send > 0) { + auto const batch_buf_offset = + num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + // Source: start of this expert's batch buffer (contiguous) + auto const batch_buf_ptr = + static_cast(rdma_x) + batch_buf_offset + + responsible_expert_idx * num_max_dispatch_tokens_per_rank * + num_bytes_per_msg; + auto const src_ptr = reinterpret_cast(batch_buf_ptr); + // Destination: start of this expert's recv buffer on remote rank + auto const dst_ptr = + reinterpret_cast(rdma_recv_x) + + dst_expert_local_idx * num_ranks * + num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + // Total bytes: all tokens for this expert + auto const total_bytes = num_tokens_to_send * num_bytes_per_msg; + + // Debug: print batch send info + // if (lane_id == 0) { + // printf("[LAM_DEV] rank=%d dst_rank=%d local_expert=%d [IBGDA BATCH SEND] " + // "from=%p to=0x%lx num_tokens=%d total_bytes=%d\n", + // rank, dst_rank, dst_expert_local_idx, + // (void*)src_ptr, dst_ptr, + // num_tokens_to_send, static_cast(total_bytes)); + // } + __threadfence_system(); + + // if (lane_id == 0) { + // printf("[LAM_DEV] rank=%d dst_rank=%d local_expert=%d [BEFORE IBGDA BATCH SEND]\n", + // rank, dst_rank, dst_expert_local_idx); + // } + // if (lane_id == 0) { + // auto const* payload = + // reinterpret_cast( + // reinterpret_cast(src_ptr) + sizeof(int4)); + // printf( + // "[IBGDA PARAMS] rank=%d " + // "num_bytes_per_msg=%lu dst_rank=%d dst_expert_local_idx=%d " + // "lane_id=%d slot_idx=%d d2h_channel_addrs=%p num_d2h_channel_addrs=%d " + // "low_latency_buffer_idx=%d, num_tokens_to_send=%d\n", + // rank, + // (unsigned 
long)total_bytes, dst_rank, + // dst_expert_local_idx, lane_id, 0, + // d2h_channel_addrs, num_d2h_channel_addrs, + // low_latency_buffer_idx, num_tokens_to_send + // ); + // } + uccl::nvshmemi_ibgda_put_nbi_warp( + dst_ptr - reinterpret_cast(rdma_buffer_ptr), + src_ptr - reinterpret_cast(rdma_buffer_ptr), + total_bytes, dst_rank, + /*warp_id=*/dst_expert_local_idx, // NOTE(Yang): for selecting rb. + lane_id, /*slot=*/0, d2h_channel_addrs, num_d2h_channel_addrs, + false, low_latency_buffer_idx, 0, 0, num_tokens_to_send); + + // Debug: print after send + // if (lane_id == 0) { + // printf("[LAM_DEV] rank=%d dst_rank=%d local_expert=%d [IBGDA BATCH SEND DONE]\n", + // rank, dst_rank, dst_expert_local_idx); + // } + } + } + // IPC: already sent in the token loop, nothing to do here + } + + __threadfence_system(); // Ensure batch sends are visible before count sends + +#endif // LAM_DEV batch send + // Issue count sends if (responsible_expert_idx < num_experts and sub_warp_id == 0 and lane_id == 0) { @@ -359,6 +531,9 @@ __global__ __launch_bounds__(1024, 1) void dispatch( rdma_recv_count + dst_expert_local_idx * num_ranks + rank); auto dst_ptr_internode = reinterpret_cast( rdma_recv_count_internode + dst_expert_local_idx * num_ranks + rank); + // printf("dst_ptr_internode: %llu (rank=%d dst_rank=%d dst_expert_local_idx=%d)\n", + // static_cast(dst_ptr_internode), rank, dst_rank, + // dst_expert_local_idx); // Try to use IPC for intra-node atomic operations auto const dst_p2p_ptr = ipc_rdma_base_ptrs @@ -366,6 +541,13 @@ __global__ __launch_bounds__(1024, 1) void dispatch( max_nvl_peers, 0) : 0; if (dst_p2p_ptr == 0) { + // printf( + // "nvshmemi_ibgda_amo_nonfetch_add: dst_ptr_internode=%llu " + // "before=N/A(remote) after=N/A(non-fetch) add_val=%d dst_rank=%d " + // "warp_id=%d num_d2h_channel_addrs=%d low_latency_buffer_idx=%d\n", + // static_cast(dst_ptr_internode), + // -num_tokens_sent - 1, dst_rank, dst_expert_local_idx, + // num_d2h_channel_addrs, 
low_latency_buffer_idx); // Inter-node or no IPC: use IBGDA atomic uccl::nvshmemi_ibgda_amo_nonfetch_add( dst_ptr_internode, reinterpret_cast(atomic_buffer_ptr), @@ -530,6 +712,14 @@ LOW_LATENCY_DISPATCH_RECV: atomicAdd(reinterpret_cast( dispatch_wait_recv_cost_stats + src_rank), wait_recv_cost); + // Debug: print recv summary (sub_warp 1, lane 0 only - where internode/ipc are known) + // if (num_recv_tokens > 0) { + // printf("[RECV SUMMARY] rank=%d local_expert=%d src_rank=%d " + // "num_tokens=%d (internode=%d ipc=%d) begin_idx=%d\n", + // rank, local_expert_idx, src_rank, + // num_recv_tokens, num_recv_tokens_internode, num_recv_tokens_ipc, + // recv_token_begin_idx); + // } } sync_barrier(warp_group_id + 2, num_warps_per_group * WARP_SIZE); @@ -556,6 +746,17 @@ LOW_LATENCY_DISPATCH_RECV: reinterpret_cast(src_src_idx) + sizeof(int4)); auto const dst_data = recv_x_int4 + (recv_token_begin_idx + i) * hidden_int4; + // Debug: print per-token recv info + // Note: src_local_token_idx is the token index on the SENDER's side (not global) + // if (lane_id == 0) { + // auto const src_local_token_idx = ld_nc_global(src_src_idx); + // auto const* data = reinterpret_cast(src_data); + // printf("[RECV] rank=%d local_expert=%d src_rank=%d slot=%d " + // "from=%p to=%p src_local_token_idx=%d data[0..3]=%08x %08x %08x %08x\n", + // rank, local_expert_idx, src_rank, i, + // (void*)src_data, (void*)dst_data, src_local_token_idx, + // data[0], data[1], data[2], data[3]); + // } UNROLLED_WARP_COPY(7, lane_id, hidden_int4, dst_data, src_data, ld_nc_global, st_na_global); diff --git a/ep/src/rdma.cpp b/ep/src/rdma.cpp index 8874509ef..273b4b7b3 100644 --- a/ep/src/rdma.cpp +++ b/ep/src/rdma.cpp @@ -1207,9 +1207,16 @@ static void post_rdma_async_batched_fast_mode( get_low_latency(cmd.cmd_type)}; #endif #ifdef USE_RECEIVER_BARRIER + // Lam: Low-latency: num_tokens from cmd.atomic_val (GPU sets it); else 1. 
+ // Use atomic_val whenever GPU set it (non-zero), not only when + // get_low_latency(cmd_type); dispatch can also set atomic_val. + uint32_t num_tokens_imm = + cmd.atomic_val ? static_cast(cmd.atomic_val) : 1u; + // get_is_combine(cmd.cmd_type) ? printf("Receiving this combine imm? num_tokens_imm: %d, cmd.atomic_val: %d\n", num_tokens_imm, cmd.atomic_val) : printf("Receiving this dispatch imm? num_tokens_imm: %d, cmd.atomic_val: %d\n", num_tokens_imm, cmd.atomic_val); + // fflush(stdout); uint32_t imm = WriteImm::Pack(get_is_combine(cmd.cmd_type), get_low_latency(cmd.cmd_type), - cmd.expert_idx, 1, my_rank) + cmd.expert_idx, num_tokens_imm, my_rank) .GetImmData(); ibv_wr_rdma_write_imm(qpx, ctx->remote_rkey, remote_addr, htonl(imm)); #else From af9dcea32a5e0a39be766e4917ed822fea76a606 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:45 -0500 Subject: [PATCH 09/15] add distributed launch script --- ep/bench/launch_distributed_ll.sh | 17 ++++++++++------- ep/bench/test_low_latency.py | 6 +++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ep/bench/launch_distributed_ll.sh b/ep/bench/launch_distributed_ll.sh index 3138536c5..bb868e0cd 100755 --- a/ep/bench/launch_distributed_ll.sh +++ b/ep/bench/launch_distributed_ll.sh @@ -11,12 +11,12 @@ # # Options: # --docker Run inside Docker container (default container: lam_rocm) -# --conda Run inside conda environment (default: /home/ubuntu/lam/conda_lam_local) +# --conda Run inside conda environment (default: /home/ubuntu/lam/uccl_lam_local) # # Environment variables: # MASTER_PORT - Master port (default: random 29500-30499) # CONTAINER - Docker container name (default: lam_rocm, only for --docker) -# CONDA_ENV - Conda environment path (default: /home/ubuntu/lam/conda_lam_local, only for --conda) +# CONDA_ENV - Conda environment path (default: /home/ubuntu/lam/uccl_lam_local, only for --conda) # NUM_TOKENS - Number of tokens (default: 128) # HIDDEN - Hidden size (default: 7168) # 
NUM_TOPK - Top-k value (default: 8) @@ -52,8 +52,8 @@ done # Node IPs (in order of node_rank) NODES=( - "172.31.24.178" - "172.31.20.184" + "172.31.16.74" # master + "172.31.18.162" ) # Configuration - MASTER_ADDR automatically set to first node @@ -66,7 +66,7 @@ NPROC_PER_NODE=8 CONTAINER="${CONTAINER:-lam_rocm}" # Conda environment path -CONDA_ENV="${CONDA_ENV:-/home/ubuntu/lam/conda_lam_local}" +CONDA_ENV="${CONDA_ENV:-/home/ubuntu/lam/uccl_lam_local}" # Kill command - kills processes based on mode if [ "$1" == "kill" ]; then @@ -200,6 +200,8 @@ export UCCL_LOG_DIR=${RUN_LOG_DIR}" --num-experts=${NUM_EXPERTS} \ --stop-after-first" local exec_cmd="docker exec ${CONTAINER} bash -c '${env_vars} && cd ${SCRIPT_DIR} && ${torchrun_cmd}'" + echo "[$(date '+%H:%M:%S')] Command to launch on node ${node_rank}:" + echo "${exec_cmd}" ssh -o StrictHostKeyChecking=no \ -o ServerAliveInterval=30 \ -o ServerAliveCountMax=10 \ @@ -217,8 +219,9 @@ export UCCL_LOG_DIR=${RUN_LOG_DIR}" --num-tokens=${NUM_TOKENS} \ --hidden=${HIDDEN} \ --num-topk=${NUM_TOPK} \ - --num-experts=${NUM_EXPERTS} \ - --stop-after-first" + --num-experts=${NUM_EXPERTS}" + echo "[$(date '+%H:%M:%S')] Command to launch on node ${node_rank}:" + echo "${conda_cmd}" ssh -o StrictHostKeyChecking=no \ -o ServerAliveInterval=30 \ -o ServerAliveCountMax=10 \ diff --git a/ep/bench/test_low_latency.py b/ep/bench/test_low_latency.py index 5c266a5ad..8d5d50546 100644 --- a/ep/bench/test_low_latency.py +++ b/ep/bench/test_low_latency.py @@ -245,7 +245,7 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): cumulative_local_expert_recv_stats = torch.zeros( (num_local_experts,), dtype=torch.int, device="cuda" ) - print(f"[python] dispatch called", flush=True) + # print(f"[python] dispatch called", flush=True) ( packed_recv_x, packed_recv_count, @@ -489,6 +489,8 @@ def test_func(return_recv_hook: bool): return_recv_hook=return_recv_hook, ) large_gemm_with_hook(hook) if return_recv_hook else None 
+ # Phase alignment: ensure all ranks finish dispatch before any starts combine + # dist.barrier(group=group) combined_x, event, hook = buffer.low_latency_combine( simulated_gemm_x, topk_idx, @@ -498,6 +500,8 @@ def test_func(return_recv_hook: bool): return_recv_hook=return_recv_hook, ) large_gemm_with_hook(hook) if return_recv_hook else None + # Phase alignment: ensure all ranks finish dispatch before any starts combine + # dist.barrier(group=group) print("✓ All correctness tests passed!", flush=True) From c80a6c5dd65d58048103f1393b77d0fcb901201c Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:46 -0500 Subject: [PATCH 10/15] remove useless comments --- ep/src/internode_ll.cu | 161 ----------------------------------------- 1 file changed, 161 deletions(-) diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index 940b0d8b8..3d74672a8 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -175,24 +175,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Sync to make shared_send_slots visible to all threads sync_barrier_1((num_warps - 1) * WARP_SIZE); - // // Lam: Debug print: verify send slot allocation (only first SM, first few tokens) - // // Uses spinlock to prevent interleaved output from different threads - // if (thread_id == 0) { - // // Acquire print lock - // while (atomicCAS(&g_print_lock, 0, 1) != 0); - // __threadfence(); - - // printf("[LAM_DEV] rank=%d token_idx=%d send_slots: ", rank, token_idx); - // for (int k = 0; k < num_topk; k++) { - // printf("(expert=%d, slot=%d) ", shared_dst_experts[k], - // shared_send_slots[k]); - // } - // printf("\n"); - - // __threadfence(); - // atomicExch(&g_print_lock, 0); // Release print lock - // } - // sync_barrier_1((num_warps - 1) * WARP_SIZE); #endif // LAM_DEV (slot allocation + debug print) // FP8 cast @@ -248,11 +230,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( } sync_barrier_1(num_threads); - // if (warp_id < num_topk && lane_id == 0 && 
dst_expert_idx < 0) { - // printf("[SKIP] rank=%d token=%d topk_slot=%d expert=-1\n", - // rank, token_idx, warp_id); - // } - // Issue IBGDA sends if (dst_expert_idx >= 0) { int slot_idx = @@ -275,7 +252,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( dst_rank, max_nvl_peers, 0) : 0; if (dst_p2p_ptr == 0) { - // if (lane_id == 0) printf("rank=%d dst_rank=%d dst_expert_idx=%d num_local_experts=%d dst_expert_local_idx=%d\n", rank, dst_rank, dst_expert_idx, num_local_experts, dst_expert_local_idx); #ifdef LAM_DEV // Lam: IBGDA -> copy temp to rdma_batch_buffer, batch send later auto const lam_slot = shared_send_slots[warp_id]; @@ -286,42 +262,7 @@ __global__ __launch_bounds__(1024, 1) void dispatch( auto* batch_buf_int4_ptr = reinterpret_cast(batch_buf_ptr); UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, batch_buf_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global); - // Debug: print copy info (payload[0..3] matches RECV format) - // if (lane_id == 0) { - // auto const* payload = - // reinterpret_cast( - // reinterpret_cast(rdma_x_src_idx) + sizeof(int4)); - // printf("[IBGDA] rank=%d token=%d expert=%d slot=%d [IBGDA COPY LAM] " - // "from=%p to=%p payload[0..3]=%08x %08x %08x %08x\n", - // rank, token_idx, dst_expert_idx, lam_slot, - // (void*)rdma_x_src_idx, (void*)batch_buf_ptr, - // payload[0], payload[1], payload[2], payload[3]); - // } #else - // Debug: print send info (payload[0..3] matches RECV format) - // if (lane_id == 0) { - // auto const* payload = - // reinterpret_cast( - // reinterpret_cast(src_ptr) + sizeof(int4)); - // printf("[IBGDA] rank=%d token=%d expert=%d slot=%d [IBGDA SEND] " - // "from=%p to=0x%lx dst_rank=%d payload[0..3]=%08x %08x %08x %08x\n", - // rank, token_idx, dst_expert_idx, slot_idx, - // (void*)src_ptr, dst_ptr, dst_rank, - // payload[0], payload[1], payload[2], payload[3]); - // } - // if (lane_id == 0) { - // printf( - // "[IBGDA PARAMS] rank=%d " - // "num_bytes_per_msg=%lu dst_rank=%d dst_expert_local_idx=%d " - 
// "lane_id=%d slot_idx=%d d2h_channel_addrs=%p num_d2h_channel_addrs=%d " - // "low_latency_buffer_idx=%d\n", - // rank, - // (unsigned long)num_bytes_per_msg, dst_rank, - // dst_expert_local_idx, lane_id, slot_idx, d2h_channel_addrs, - // num_d2h_channel_addrs, - // low_latency_buffer_idx - // ); - // } __threadfence_system(); uccl::nvshmemi_ibgda_put_nbi_warp( dst_ptr - reinterpret_cast(rdma_buffer_ptr), @@ -338,17 +279,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( auto* dst_int4_ptr = reinterpret_cast(dst_p2p_ptr); UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global); - // Debug: print IPC copy info (payload data[0..3] matches RECV format) - // if (lane_id == 0) { - // auto const* payload = - // reinterpret_cast( - // reinterpret_cast(src_ptr) + sizeof(int4)); - // printf("[IPC] rank=%d token=%d expert=%d slot=%d [IPC COPY] " - // "from=%p to=%p dst_rank=%d payload[0..3]=%08x %08x %08x %08x\n", - // rank, token_idx, dst_expert_idx, slot_idx, - // (void*)src_ptr, (void*)dst_p2p_ptr, dst_rank, - // payload[0], payload[1], payload[2], payload[3]); - // } } // Increase counter after finishing __syncwarp(); @@ -457,36 +387,8 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Total bytes: all tokens for this expert auto const total_bytes = num_tokens_to_send * num_bytes_per_msg; - // Debug: print batch send info - // if (lane_id == 0) { - // printf("[LAM_DEV] rank=%d dst_rank=%d local_expert=%d [IBGDA BATCH SEND] " - // "from=%p to=0x%lx num_tokens=%d total_bytes=%d\n", - // rank, dst_rank, dst_expert_local_idx, - // (void*)src_ptr, dst_ptr, - // num_tokens_to_send, static_cast(total_bytes)); - // } __threadfence_system(); - // if (lane_id == 0) { - // printf("[LAM_DEV] rank=%d dst_rank=%d local_expert=%d [BEFORE IBGDA BATCH SEND]\n", - // rank, dst_rank, dst_expert_local_idx); - // } - // if (lane_id == 0) { - // auto const* payload = - // reinterpret_cast( - // reinterpret_cast(src_ptr) + 
sizeof(int4)); - // printf( - // "[IBGDA PARAMS] rank=%d " - // "num_bytes_per_msg=%lu dst_rank=%d dst_expert_local_idx=%d " - // "lane_id=%d slot_idx=%d d2h_channel_addrs=%p num_d2h_channel_addrs=%d " - // "low_latency_buffer_idx=%d, num_tokens_to_send=%d\n", - // rank, - // (unsigned long)total_bytes, dst_rank, - // dst_expert_local_idx, lane_id, 0, - // d2h_channel_addrs, num_d2h_channel_addrs, - // low_latency_buffer_idx, num_tokens_to_send - // ); - // } uccl::nvshmemi_ibgda_put_nbi_warp( dst_ptr - reinterpret_cast(rdma_buffer_ptr), src_ptr - reinterpret_cast(rdma_buffer_ptr), @@ -495,11 +397,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( lane_id, /*slot=*/0, d2h_channel_addrs, num_d2h_channel_addrs, false, low_latency_buffer_idx, 0, 0, num_tokens_to_send); - // Debug: print after send - // if (lane_id == 0) { - // printf("[LAM_DEV] rank=%d dst_rank=%d local_expert=%d [IBGDA BATCH SEND DONE]\n", - // rank, dst_rank, dst_expert_local_idx); - // } } } // IPC: already sent in the token loop, nothing to do here @@ -531,9 +428,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( rdma_recv_count + dst_expert_local_idx * num_ranks + rank); auto dst_ptr_internode = reinterpret_cast( rdma_recv_count_internode + dst_expert_local_idx * num_ranks + rank); - // printf("dst_ptr_internode: %llu (rank=%d dst_rank=%d dst_expert_local_idx=%d)\n", - // static_cast(dst_ptr_internode), rank, dst_rank, - // dst_expert_local_idx); // Try to use IPC for intra-node atomic operations auto const dst_p2p_ptr = ipc_rdma_base_ptrs @@ -541,13 +435,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( max_nvl_peers, 0) : 0; if (dst_p2p_ptr == 0) { - // printf( - // "nvshmemi_ibgda_amo_nonfetch_add: dst_ptr_internode=%llu " - // "before=N/A(remote) after=N/A(non-fetch) add_val=%d dst_rank=%d " - // "warp_id=%d num_d2h_channel_addrs=%d low_latency_buffer_idx=%d\n", - // static_cast(dst_ptr_internode), - // -num_tokens_sent - 1, dst_rank, dst_expert_local_idx, - // 
num_d2h_channel_addrs, low_latency_buffer_idx); // Inter-node or no IPC: use IBGDA atomic uccl::nvshmemi_ibgda_amo_nonfetch_add( dst_ptr_internode, reinterpret_cast(atomic_buffer_ptr), @@ -561,21 +448,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( -num_tokens_sent - 1); } -#ifdef LAM_DEV - // Lam: Debug print: verify send counter matches expected count - // { - // auto const send_counter_val = - // atomic_send_counter_per_expert[responsible_expert_idx]; - // if (rank == 0 && responsible_expert_idx < 8) { - // printf( - // "[LAM_DEV] rank=%d expert=%d: expected_count=%d, send_counter=%d, " - // "match=%s\n", - // rank, responsible_expert_idx, num_tokens_sent, send_counter_val, - // (num_tokens_sent == send_counter_val) ? "YES" : "NO"); - // } - // } -#endif - // Clean workspace for next use atomic_counter_per_expert[responsible_expert_idx] = 0; atomic_finish_counter_per_expert[responsible_expert_idx] = 0; @@ -690,13 +562,6 @@ LOW_LATENCY_DISPATCH_RECV: num_recv_tokens_internode != 0 ? -num_recv_tokens_internode - 1 : 0; num_recv_tokens_ipc = num_recv_tokens_ipc != 0 ? 
-num_recv_tokens_ipc - 1 : 0; - // printf( - // "num_recv_tokens_internode: %d, num_recv_tokens_ipc: %d, src_rank:" - // "%d, rank: %d, max_nvl_peers: %d, responsible_expert_idx: %d," - // "num_experts: %d, num_local_experts: %d\n", - // num_recv_tokens_internode, num_recv_tokens_ipc, src_rank, rank, - // max_nvl_peers, responsible_expert_idx, num_experts, - // num_local_experts); num_recv_tokens = num_recv_tokens_internode + num_recv_tokens_ipc; recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens); @@ -712,14 +577,6 @@ LOW_LATENCY_DISPATCH_RECV: atomicAdd(reinterpret_cast( dispatch_wait_recv_cost_stats + src_rank), wait_recv_cost); - // Debug: print recv summary (sub_warp 1, lane 0 only - where internode/ipc are known) - // if (num_recv_tokens > 0) { - // printf("[RECV SUMMARY] rank=%d local_expert=%d src_rank=%d " - // "num_tokens=%d (internode=%d ipc=%d) begin_idx=%d\n", - // rank, local_expert_idx, src_rank, - // num_recv_tokens, num_recv_tokens_internode, num_recv_tokens_ipc, - // recv_token_begin_idx); - // } } sync_barrier(warp_group_id + 2, num_warps_per_group * WARP_SIZE); @@ -746,17 +603,6 @@ LOW_LATENCY_DISPATCH_RECV: reinterpret_cast(src_src_idx) + sizeof(int4)); auto const dst_data = recv_x_int4 + (recv_token_begin_idx + i) * hidden_int4; - // Debug: print per-token recv info - // Note: src_local_token_idx is the token index on the SENDER's side (not global) - // if (lane_id == 0) { - // auto const src_local_token_idx = ld_nc_global(src_src_idx); - // auto const* data = reinterpret_cast(src_data); - // printf("[RECV] rank=%d local_expert=%d src_rank=%d slot=%d " - // "from=%p to=%p src_local_token_idx=%d data[0..3]=%08x %08x %08x %08x\n", - // rank, local_expert_idx, src_rank, i, - // (void*)src_data, (void*)dst_data, src_local_token_idx, - // data[0], data[1], data[2], data[3]); - // } UNROLLED_WARP_COPY(7, lane_id, hidden_int4, dst_data, src_data, ld_nc_global, st_na_global); @@ -791,8 +637,6 @@ 
LOW_LATENCY_DISPATCH_RECV: } } } - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[dispatch] RECV finished\n"); } } @@ -1223,8 +1067,6 @@ __global__ __launch_bounds__(1024, 1) void combine( // Receiving phase LOW_LATENCY_COMBINE_RECV: if ((phases & LOW_LATENCY_RECV_PHASE) == 0) { - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[combine] SEND finished\n"); return; } // Wait all ranks to arrive @@ -1337,8 +1179,6 @@ LOW_LATENCY_COMBINE_RECV: token_idx * hidden_bf16_int4)[hidden_idx] = combined_int4; } - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[combine] RECV finished\n"); } #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) @@ -1380,7 +1220,6 @@ void combine(void* combined_x, void* rdma_recv_x, int* rdma_recv_flag, constexpr int kNumTMABytesPerWarp = 12 * (512 + 16); int const smem_size = kNumTMABytesPerWarp * num_warps; - // printf("Combine launched\n"); #define COMBINE_LAUNCH_CASE(hidden) \ { \ From 922a0abfae56afcd8caea7a934c15851f5a8ab46 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:01:46 -0500 Subject: [PATCH 11/15] support the use of fp8 conversion or not --- ep/bench/test_low_latency.py | 49 ++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/ep/bench/test_low_latency.py b/ep/bench/test_low_latency.py index 8d5d50546..1064fda05 100644 --- a/ep/bench/test_low_latency.py +++ b/ep/bench/test_low_latency.py @@ -145,6 +145,7 @@ def test_main( group: dist.ProcessGroup, buffer: Buffer, use_logfmt: bool = False, + dispatch_use_fp8: bool = True, seed: int = 0, skip_benchmark: bool = False, debug_hash: bool = False, @@ -218,20 +219,20 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): for return_recv_hook in (False, True): if stop_now: break - for dispatch_use_fp8 in (False, True): + for dispatch_use_fp8_case in (False, True): if stop_now: break for round_scale in (False,): if stop_now: break - for round_scale in (False, True) 
if dispatch_use_fp8 else (False,): + for round_scale in (False, True) if dispatch_use_fp8_case else (False,): if stop_now: break for use_ue8m0 in (False, True) if round_scale else (False,): print( "Start experiment with settings:" f" return_recv_hook={return_recv_hook}" - f" dispatch_use_fp8={dispatch_use_fp8}" + f" dispatch_use_fp8={dispatch_use_fp8_case}" f" round_scale={round_scale}" f" use_ue8m0={use_ue8m0}", flush=True, @@ -257,7 +258,7 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): topk_idx, num_tokens, num_experts, - use_fp8=dispatch_use_fp8, + use_fp8=dispatch_use_fp8_case, round_scale=round_scale, use_ue8m0=use_ue8m0, cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats, @@ -271,7 +272,7 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): ) packed_recv_x = ( (packed_recv_x[0], packed_recv_x[1].contiguous()) - if dispatch_use_fp8 + if dispatch_use_fp8_case else packed_recv_x ) simulated_gemm_x = ( @@ -279,7 +280,7 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128), ).view(packed_recv_x[0].shape) - if dispatch_use_fp8 + if dispatch_use_fp8_case else packed_recv_x.clone() ) all_topk_idx = torch.empty( @@ -296,7 +297,7 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): per_token_cast_back( packed_recv_x[0][i], packed_recv_x[1][i] ) - if dispatch_use_fp8 + if dispatch_use_fp8_case else packed_recv_x[i] ) recv_count, recv_src_info, recv_layout_range = ( @@ -361,11 +362,11 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): - j + rank_offset ).sum().item() == 0 - if dispatch_use_fp8: + if dispatch_use_fp8_case: tag = ( f"x={'x' if current_x is x else 'rand'}" f"|hook={return_recv_hook}" - f"|fp8={dispatch_use_fp8}" + f"|fp8={dispatch_use_fp8_case}" f"|rs={round_scale}" f"|ue={use_ue8m0}" f"|le={i}" @@ -383,7 
+384,7 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): tag = ( f"x={'x' if current_x is x else 'rand'}" f"|hook={return_recv_hook}" - f"|fp8={dispatch_use_fp8}" + f"|fp8={dispatch_use_fp8_case}" f"|rs={round_scale}" f"|ue={use_ue8m0}" f"|le={i}" @@ -445,12 +446,12 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): ) assert torch.isnan(combined_x).sum().item() == 0 assert diff < ( - 9e-4 if dispatch_use_fp8 else 1e-5 - ), f"Error: {diff=}, {dispatch_use_fp8=}, {zero_copy=}" + 9e-4 if dispatch_use_fp8_case else 1e-5 + ), f"Error: {diff=}, {dispatch_use_fp8_case=}, {zero_copy=}" tag = ( f"x={'x' if current_x is x else 'rand'}" f"|hook={return_recv_hook}" - f"|fp8={dispatch_use_fp8}" + f"|fp8={dispatch_use_fp8_case}" f"|rs={round_scale}" f"|ue={use_ue8m0}" f"|zc={zero_copy}" @@ -484,13 +485,13 @@ def test_func(return_recv_hook: bool): num_tokens, num_experts, cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats, - use_fp8=True, + use_fp8=dispatch_use_fp8, async_finish=False, return_recv_hook=return_recv_hook, ) large_gemm_with_hook(hook) if return_recv_hook else None # Phase alignment: ensure all ranks finish dispatch before any starts combine - # dist.barrier(group=group) + dist.barrier(group=group) combined_x, event, hook = buffer.low_latency_combine( simulated_gemm_x, topk_idx, @@ -501,13 +502,12 @@ def test_func(return_recv_hook: bool): ) large_gemm_with_hook(hook) if return_recv_hook else None # Phase alignment: ensure all ranks finish dispatch before any starts combine - # dist.barrier(group=group) + dist.barrier(group=group) print("✓ All correctness tests passed!", flush=True) # Lam: If stop_after_first, just run once to check the code - # or stop_after_first - if skip_benchmark: + if skip_benchmark or stop_after_first: return (hash_value, hash_details) if debug_hash else hash_value # Calculate bandwidth @@ -516,7 +516,9 @@ def test_func(return_recv_hook: bool): 
num_dispatch_comm_bytes, num_combine_comm_bytes = 0, 0 for i in range(num_tokens): num_selections = (topk_idx[i] != -1).sum().item() - num_dispatch_comm_bytes += num_fp8_bytes * num_selections + num_dispatch_comm_bytes += ( + num_fp8_bytes if dispatch_use_fp8 else num_bf16_bytes + ) * num_selections num_combine_comm_bytes += ( num_logfmt10_bytes if use_logfmt else num_bf16_bytes ) * num_selections @@ -588,6 +590,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): group, buffer, use_logfmt=args.use_logfmt, + dispatch_use_fp8=args.dispatch_use_fp8, seed=seed, skip_benchmark=args.pressure_test_mode == 1, debug_hash=args.debug_hash, @@ -616,6 +619,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): group, buffer, use_logfmt=args.use_logfmt, + dispatch_use_fp8=args.dispatch_use_fp8, seed=seed, skip_benchmark=args.pressure_test_mode == 1, debug_hash=args.debug_hash, @@ -700,6 +704,13 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): parser.add_argument( "--use-logfmt", action="store_true", help="Whether to test LogFMT combine" ) + parser.add_argument( + "--dispatch-use-fp8", + type=bool, + default=True, + action=argparse.BooleanOptionalAction, + help="Whether dispatch path uses FP8 casting (default: true).", + ) parser.add_argument( "--pressure-test-mode", type=int, From a6ec671635198ef88f53454ff06380e2b04b5bef Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:03:59 -0500 Subject: [PATCH 12/15] add plotting --- .../dispatch_bytes_per_message_breakdown.svg | 2762 +++++++++++++++++ ep/bench/plots/plot_dispatch_message_sizes.py | 205 ++ ep/bench/plots/w_batch_data.txt | 142 + ep/bench/plots/wo_batch_data.txt | 505 +++ 4 files changed, 3614 insertions(+) create mode 100644 ep/bench/plots/dispatch_bytes_per_message_breakdown.svg create mode 100644 ep/bench/plots/plot_dispatch_message_sizes.py create mode 100644 ep/bench/plots/w_batch_data.txt create mode 100644 
ep/bench/plots/wo_batch_data.txt diff --git a/ep/bench/plots/dispatch_bytes_per_message_breakdown.svg b/ep/bench/plots/dispatch_bytes_per_message_breakdown.svg new file mode 100644 index 000000000..8199baa87 --- /dev/null +++ b/ep/bench/plots/dispatch_bytes_per_message_breakdown.svg @@ -0,0 +1,2762 @@ + + + + + + + + 2026-02-15T19:40:25.606406 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ep/bench/plots/plot_dispatch_message_sizes.py b/ep/bench/plots/plot_dispatch_message_sizes.py new file mode 100644 index 000000000..8cbfc14d9 --- /dev/null +++ b/ep/bench/plots/plot_dispatch_message_sizes.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +import argparse +import math +import re +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +MSG_RE = re.compile( + r"\[RDMA 
MSG dispatch\]\s+th=(?P\d+)\s+src=(?P\d+)\s+dst=(?P\d+)\s+msg_bytes=(?P\d+)" +) +POST_RE = re.compile( + r"\[RDMA POST dispatch-only\]\s+th=(?P\d+)\s+src=(?P\d+)\s+dst=(?P\d+)\s+dispatch_wrs=(?P\d+)\s+dispatch_bytes=(?P\d+)" +) + + +def parse_message_sizes(path: Path): + data = [] + source_mode = None + + for line in path.read_text().splitlines(): + m = MSG_RE.search(line) + if m: + source_mode = "msg_bytes" + data.append((int(m.group("dst")), int(m.group("msg")))) + + if data: + return data, source_mode + + # Fallback for older logs: use per-post average when per-message logs are absent. + for line in path.read_text().splitlines(): + m = POST_RE.search(line) + if not m: + continue + wrs = int(m.group("wrs")) + if wrs <= 0: + continue + source_mode = "dispatch_bytes/dispatch_wrs" + msg_size = int(round(int(m.group("bytes")) / wrs)) + data.append((int(m.group("dst")), msg_size)) + + return data, source_mode + + +def percentile(values, p): + arr = np.asarray(values, dtype=float) + return float(np.percentile(arr, p)) + + +def format_stats(values): + arr = np.asarray(values, dtype=float) + return { + "n": arr.size, + "min": int(arr.min()), + "p50": int(percentile(arr, 50)), + "p90": int(percentile(arr, 90)), + "max": int(arr.max()), + "mean": float(arr.mean()), + } + + +def main(): + parser = argparse.ArgumentParser( + description="Plot RDMA dispatch message-size breakdown from logs." 
+ ) + parser.add_argument( + "--w-batch", + default="ep/bench/logs/w_batch_data.txt", + help="Path to with-batch log file.", + ) + parser.add_argument( + "--wo-batch", + default="ep/bench/logs/wo_batch_data.txt", + help="Path to without-batch log file.", + ) + parser.add_argument( + "--out", + default="ep/bench/logs/dispatch_message_size_breakdown.png", + help="Output figure path.", + ) + parser.add_argument( + "--bucket-size", + type=int, + default=14352, + help="Bucket width in bytes for message-size histogram.", + ) + args = parser.parse_args() + + w_path = Path(args.w_batch) + wo_path = Path(args.wo_batch) + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + + series = {} + modes = {} + by_dst = {} + for name, path in [("w_batch", w_path), ("wo_batch", wo_path)]: + parsed, mode = parse_message_sizes(path) + if not parsed: + raise ValueError(f"No parseable dispatch entries found in {path}") + modes[name] = mode + sizes = [v for _, v in parsed] + series[name] = sizes + dst_map = defaultdict(list) + for dst, v in parsed: + dst_map[dst].append(v) + by_dst[name] = {d: float(np.mean(vs)) for d, vs in sorted(dst_map.items())} + + # Bucketize by fixed byte range and render side-by-side bars. 
+ bucket_size = max(1, int(args.bucket_size)) + max_size = max(max(series["w_batch"]), max(series["wo_batch"])) + num_buckets = int(math.ceil(max_size / bucket_size)) + bins = np.arange(0, (num_buckets + 1) * bucket_size + 1, bucket_size) + hist_wo, _ = np.histogram(series["wo_batch"], bins=bins) + hist_w, _ = np.histogram(series["w_batch"], bins=bins) + bucket_indices = np.arange(len(bins) - 1) + bar_width = 0.82 + bucket_labels = [f"{int(bins[i])//1024}-{int(bins[i+1])//1024}KB" for i in range(len(bins) - 1)] + + fig = plt.figure(figsize=(16, 10), dpi=160, constrained_layout=True) + gs = fig.add_gridspec(2, 2, height_ratios=[1, 1.1], hspace=0.35, wspace=0.2) + ax_top_left = fig.add_subplot(gs[0, 0]) + ax_top_right = fig.add_subplot(gs[0, 1]) + ax_bottom = fig.add_subplot(gs[1, :]) + + ax_top_left.bar( + bucket_indices, + hist_wo, + width=bar_width, + color="#ff7f0e", + alpha=0.85, + ) + ax_top_left.set_title(f"without batch message-size buckets (total WR={len(series['wo_batch'])})") + ax_top_left.set_xlabel("message-size bucket") + ax_top_left.set_ylabel("count") + ax_top_left.set_xticks(bucket_indices) + ax_top_left.set_xticklabels(bucket_labels, rotation=30, ha="right") + ax_top_left.grid(axis="y", alpha=0.25) + ax_top_left.set_xlim(-0.5, len(bucket_labels) - 0.5) + ax_top_left.text( + 0.98, + 0.95, + f"Total WR: {len(series['wo_batch'])}", + transform=ax_top_left.transAxes, + ha="right", + va="top", + fontsize=14, + bbox={"boxstyle": "round,pad=0.25", "facecolor": "white", "alpha": 0.85, "edgecolor": "#cccccc"}, + ) + + ax_top_right.bar( + bucket_indices, hist_w, width=bar_width, color="#1f77b4", alpha=0.85 + ) + ax_top_right.set_title(f"with batch message-size buckets (total WR={len(series['w_batch'])})") + ax_top_right.set_xlabel("message-size bucket") + ax_top_right.set_ylabel("count") + ax_top_right.set_xticks(bucket_indices) + ax_top_right.set_xticklabels(bucket_labels, rotation=30, ha="right") + ax_top_right.grid(axis="y", alpha=0.25) + 
ax_top_right.set_xlim(-0.5, len(bucket_labels) - 0.5) + ax_top_right.text( + 0.98, + 0.95, + f"Total WR: {len(series['w_batch'])}", + transform=ax_top_right.transAxes, + ha="right", + va="top", + fontsize=14, + bbox={"boxstyle": "round,pad=0.25", "facecolor": "white", "alpha": 0.85, "edgecolor": "#cccccc"}, + ) + + all_dsts = sorted(set(by_dst["w_batch"]) | set(by_dst["wo_batch"])) + x = np.arange(len(all_dsts)) + width = 0.38 + vals_w = [by_dst["w_batch"].get(d, np.nan) for d in all_dsts] + vals_wo = [by_dst["wo_batch"].get(d, np.nan) for d in all_dsts] + ax_bottom.bar(x - width / 2, vals_wo, width=width, label="without_batch", color="#ff7f0e") + ax_bottom.bar(x + width / 2, vals_w, width=width, label="with_batch", color="#1f77b4") + ax_bottom.set_xticks(x) + ax_bottom.set_xticklabels([str(d) for d in all_dsts]) + ax_bottom.set_title("Average message size by dst rank") + ax_bottom.set_xlabel("dst rank") + ax_bottom.set_ylabel("avg bytes/message") + ax_bottom.grid(axis="y", alpha=0.25) + ax_bottom.legend() + + fig.suptitle( + f"Dispatch message-size breakdown (without batch WR={len(series['wo_batch'])}, with batch WR={len(series['w_batch'])})" + ) + fig.savefig(out_path) + + print(f"Saved figure: {out_path}") + for name in ["w_batch", "wo_batch"]: + st = format_stats(series[name]) + print( + f"{name}: mode={modes[name]}, n={st['n']}, min={st['min']}, " + f"p50={st['p50']}, p90={st['p90']}, max={st['max']}, mean={st['mean']:.1f}" + ) + + +if __name__ == "__main__": + main() diff --git a/ep/bench/plots/w_batch_data.txt b/ep/bench/plots/w_batch_data.txt new file mode 100644 index 000000000..046331948 --- /dev/null +++ b/ep/bench/plots/w_batch_data.txt @@ -0,0 +1,142 @@ +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=100464 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=71760 +[RDMA MSG dispatch] th=0 
src=0 dst=9 msg_bytes=57408 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=57408 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=71760 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=57408 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=100464 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=43056 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=71760 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=71760 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=71760 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=71760 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=100464 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=71760 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=86112 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=86112 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=114816 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=71760 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=43056 
+[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=129168 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=71760 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=71760 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=57408 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=57408 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=86112 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=71760 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=86112 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=71760 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=71760 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=86112 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=43056 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=57408 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=57408 +[RDMA MSG dispatch] th=2 
src=0 dst=10 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=114816 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=71760 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=43056 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=28704 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=71760 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=86112 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=100464 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=100464 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=57408 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=57408 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=43056 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=86112 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=28704 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=86112 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=28704 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=28704 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=28704 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=86112 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=86112 +[RDMA MSG dispatch] th=1 src=0 dst=10 
msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=28704 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=43056 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=86112 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=86112 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=100464 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=71760 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=28704 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=43056 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=57408 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=28704 \ No newline at end of file diff --git a/ep/bench/plots/wo_batch_data.txt b/ep/bench/plots/wo_batch_data.txt new file mode 100644 index 000000000..780992cfe --- /dev/null +++ b/ep/bench/plots/wo_batch_data.txt @@ -0,0 +1,505 @@ +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 
+[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 
dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 
+[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] 
th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 
msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG 
dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 
dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA 
MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 
msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG 
dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 
+[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] 
th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 
msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 
+[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=8 msg_bytes=14352 +[RDMA MSG dispatch] th=0 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=11 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=2 src=0 dst=14 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=3 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=15 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=13 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=12 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=10 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=9 msg_bytes=14352 +[RDMA MSG dispatch] th=1 src=0 dst=14 msg_bytes=14352 \ No newline at end of file From 040313bd2c4f77e84cc246375a3d036a844e13ce Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:04:00 -0500 Subject: [PATCH 13/15] Implement RDMA post dispatch logging with debug option --- ep/src/rdma.cpp | 135 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/ep/src/rdma.cpp b/ep/src/rdma.cpp index 273b4b7b3..679481305 100644 --- a/ep/src/rdma.cpp +++ b/ep/src/rdma.cpp @@ -38,6 +38,10 @@ #include #include +#ifndef UCCL_RDMA_POST_DEBUG +#define UCCL_RDMA_POST_DEBUG 0 +#endif + 
void recv_connection_info_as_server(int my_rank, int* actual_peer, int listen_fd, RDMAConnectionInfo* remote_array) { @@ -732,6 +736,67 @@ void post_receive_buffer_for_imm(ProxyCtx& S) { } } +static inline void log_dispatch_rdma_post(int thread_idx, int src_rank, + int dst_rank, size_t dispatch_wrs, + uint64_t dispatch_bytes, + size_t total_wrs, + uint64_t total_bytes) { +#if UCCL_RDMA_POST_DEBUG + if (dispatch_wrs == 0) return; + + static std::atomic g_dispatch_post_count{0}; + static std::atomic g_dispatch_wr_count{0}; + static std::atomic g_dispatch_bytes_sum{0}; + static std::once_flag g_dispatch_summary_once; + std::call_once(g_dispatch_summary_once, []() { + std::atexit([]() { + uint64_t post_count = g_dispatch_post_count.load(std::memory_order_relaxed); + uint64_t wr_count = g_dispatch_wr_count.load(std::memory_order_relaxed); + uint64_t bytes_sum = g_dispatch_bytes_sum.load(std::memory_order_relaxed); + if (post_count == 0) return; + fprintf(stderr, + "[RDMA POST dispatch summary] posts=%llu dispatch_wrs=%llu " + "dispatch_bytes=%llu\n", + (unsigned long long)post_count, (unsigned long long)wr_count, + (unsigned long long)bytes_sum); + }); + }); + + g_dispatch_post_count.fetch_add(1, std::memory_order_relaxed); + g_dispatch_wr_count.fetch_add(dispatch_wrs, std::memory_order_relaxed); + g_dispatch_bytes_sum.fetch_add(dispatch_bytes, std::memory_order_relaxed); + fprintf(stderr, + "[RDMA POST dispatch-only] th=%d src=%d dst=%d dispatch_wrs=%zu " + "dispatch_bytes=%llu total_wrs=%zu total_bytes=%llu\n", + thread_idx, src_rank, dst_rank, dispatch_wrs, + (unsigned long long)dispatch_bytes, total_wrs, + (unsigned long long)total_bytes); +#else + (void)thread_idx; + (void)src_rank; + (void)dst_rank; + (void)dispatch_wrs; + (void)dispatch_bytes; + (void)total_wrs; + (void)total_bytes; +#endif +} + +static inline void log_dispatch_rdma_message_size(int thread_idx, int src_rank, + int dst_rank, + uint32_t msg_bytes) { +#if UCCL_RDMA_POST_DEBUG + fprintf(stderr, + 
"[RDMA MSG dispatch] th=%d src=%d dst=%d msg_bytes=%u\n", thread_idx, + src_rank, dst_rank, msg_bytes); +#else + (void)thread_idx; + (void)src_rank; + (void)dst_rank; + (void)msg_bytes; +#endif +} + // Normal mode implementation static void post_rdma_async_batched_normal_mode( ProxyCtx& S, void* buf, size_t num_wrs, @@ -873,6 +938,22 @@ static void post_rdma_async_batched_normal_mode( ring_wrids.push_back(wrs_to_post[i]); } +#if UCCL_RDMA_POST_DEBUG + uint64_t total_bytes = 0; + uint64_t dispatch_bytes = 0; + size_t dispatch_wrs = 0; + for (size_t i : idxs) { + total_bytes += cmds_to_post[i].bytes; + if (!get_is_combine(cmds_to_post[i].cmd_type)) { + dispatch_bytes += cmds_to_post[i].bytes; + ++dispatch_wrs; + log_dispatch_rdma_message_size(thread_idx, my_rank, dst_rank, + cmds_to_post[i].bytes); + } + } + log_dispatch_rdma_post(thread_idx, my_rank, dst_rank, dispatch_wrs, + dispatch_bytes, idxs.size(), total_bytes); +#endif int ret = ibv_wr_complete(qpx); if (ret) { fprintf(stderr, "ibv_wr_complete failed (dst=%d): %s (ret=%d)\n", @@ -974,6 +1055,22 @@ static void post_rdma_async_batched_normal_mode( } // Post the chain +#if UCCL_RDMA_POST_DEBUG + uint64_t total_bytes = 0; + uint64_t dispatch_bytes = 0; + size_t dispatch_wrs = 0; + for (size_t i : idxs) { + total_bytes += cmds_to_post[i].bytes; + if (!get_is_combine(cmds_to_post[i].cmd_type)) { + dispatch_bytes += cmds_to_post[i].bytes; + ++dispatch_wrs; + log_dispatch_rdma_message_size(thread_idx, my_rank, dst_rank, + cmds_to_post[i].bytes); + } + } + log_dispatch_rdma_post(thread_idx, my_rank, dst_rank, dispatch_wrs, + dispatch_bytes, kgroup, total_bytes); +#endif ibv_send_wr* bad = nullptr; int ret = ibv_post_send(qp, &wrs[0], &bad); if (ret) { @@ -1083,6 +1180,22 @@ static void post_rdma_async_batched_normal_mode( } // Post the chain +#if UCCL_RDMA_POST_DEBUG + uint64_t total_bytes = 0; + uint64_t dispatch_bytes = 0; + size_t dispatch_wrs = 0; + for (size_t i : idxs) { + total_bytes += cmds_to_post[i].bytes; + 
if (!get_is_combine(cmds_to_post[i].cmd_type)) { + dispatch_bytes += cmds_to_post[i].bytes; + ++dispatch_wrs; + log_dispatch_rdma_message_size(thread_idx, my_rank, dst_rank, + cmds_to_post[i].bytes); + } + } + log_dispatch_rdma_post(thread_idx, my_rank, dst_rank, dispatch_wrs, + dispatch_bytes, kgroup, total_bytes); +#endif ibv_send_wr* bad = nullptr; int ret = ibv_post_send(qp, &wrs[0], &bad); if (ret) { @@ -1148,6 +1261,20 @@ static void post_rdma_async_batched_fast_mode( std::abort(); } size_t const k = wr_ids.size(); +#if UCCL_RDMA_POST_DEBUG + uint64_t dst_total_bytes = 0; + for (size_t i : wr_ids) dst_total_bytes += cmds_to_post[i].bytes; + uint64_t dst_dispatch_bytes = 0; + size_t dst_dispatch_wrs = 0; + for (size_t i : wr_ids) { + if (!get_is_combine(cmds_to_post[i].cmd_type)) { + dst_dispatch_bytes += cmds_to_post[i].bytes; + ++dst_dispatch_wrs; + log_dispatch_rdma_message_size(thread_idx, my_rank, dst_rank, + cmds_to_post[i].bytes); + } + } +#endif #ifdef EFA struct ibv_qp_ex* qpx = (struct ibv_qp_ex*)ctx->qp; ibv_wr_start(qpx); @@ -1266,6 +1393,10 @@ static void post_rdma_async_batched_fast_mode( } #endif +#if UCCL_RDMA_POST_DEBUG + log_dispatch_rdma_post(thread_idx, my_rank, dst_rank, dst_dispatch_wrs, + dst_dispatch_bytes, k, dst_total_bytes); +#endif int ret = ibv_wr_complete(qpx); if (ret) { fprintf(stderr, "ibv_wr_complete failed (dst=%d): %s (ret=%d)\n", @@ -1317,6 +1448,10 @@ static void post_rdma_async_batched_fast_mode( wrs[last].send_flags |= IBV_SEND_SIGNALED; wrs[last].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; wrs[last].imm_data = htonl(static_cast(batch_tail_wr)); +#if UCCL_RDMA_POST_DEBUG + log_dispatch_rdma_post(thread_idx, my_rank, dst_rank, dst_dispatch_wrs, + dst_dispatch_bytes, k, dst_total_bytes); +#endif ibv_send_wr* bad = nullptr; int ret = ibv_post_send(ctx->qp, &wrs[0], &bad); if (ret) { From 08daaea2f69853ab2b7a44f02852b449b7404e13 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:04:00 -0500 Subject: [PATCH 14/15] 
aligning the mesauremnet method to pplx to get apples-to-apples comparsin --- ep/bench/test_low_latency.py | 359 +++++++++++++++++++++++++++++------ 1 file changed, 297 insertions(+), 62 deletions(-) diff --git a/ep/bench/test_low_latency.py b/ep/bench/test_low_latency.py index 1064fda05..31c19da04 100644 --- a/ep/bench/test_low_latency.py +++ b/ep/bench/test_low_latency.py @@ -21,7 +21,6 @@ import torch import torch.distributed as dist import numpy as np -from functools import partial from typing import Optional @@ -87,8 +86,6 @@ def mark_done(): from utils import ( init_dist, init_dist_under_torchrun, - bench, - bench_kineto, calc_diff, hash_tensor, per_token_cast_back, @@ -150,6 +147,8 @@ def test_main( skip_benchmark: bool = False, debug_hash: bool = False, stop_after_first: bool = False, + num_warmup: int = 10000, + num_repeats: int = 10000, ): torch.manual_seed(seed + rank) random.seed(seed + rank) @@ -470,40 +469,6 @@ def _record_hash(label: str, t: torch.Tensor, include_in_overall: bool = True): if stop_now: break - # noinspection PyShadowingNames - def large_gemm_with_hook(hook): - mat_0 = torch.randn((8192, 8192), dtype=torch.float) - mat_1 = torch.randn((8192, 8192), dtype=torch.float) - mat_0 @ mat_1 - hook() - - # noinspection PyShadowingNames - def test_func(return_recv_hook: bool): - recv_x, recv_count, handle, event, hook = buffer.low_latency_dispatch( - current_x, - topk_idx, - num_tokens, - num_experts, - cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats, - use_fp8=dispatch_use_fp8, - async_finish=False, - return_recv_hook=return_recv_hook, - ) - large_gemm_with_hook(hook) if return_recv_hook else None - # Phase alignment: ensure all ranks finish dispatch before any starts combine - dist.barrier(group=group) - combined_x, event, hook = buffer.low_latency_combine( - simulated_gemm_x, - topk_idx, - topk_weights, - handle, - use_logfmt=use_logfmt, - return_recv_hook=return_recv_hook, - ) - large_gemm_with_hook(hook) if 
return_recv_hook else None - # Phase alignment: ensure all ranks finish dispatch before any starts combine - dist.barrier(group=group) - print("✓ All correctness tests passed!", flush=True) # Lam: If stop_after_first, just run once to check the code @@ -523,37 +488,291 @@ def test_func(return_recv_hook: bool): num_logfmt10_bytes if use_logfmt else num_bf16_bytes ) * num_selections - # Dispatch + combine testing - avg_t, min_t, max_t = bench(partial(test_func, return_recv_hook=False)) - print( - f"[rank {rank}] Dispatch + combine bandwidth: {(num_dispatch_comm_bytes + num_combine_comm_bytes) / 1e9 / avg_t:.2f} GB/s, " - f"avg_t={avg_t * 1e6:.2f} us, min_t={min_t * 1e6:.2f} us, max_t={max_t * 1e6:.2f} us", - flush=True, + # Benchmark with the same timing structure as pplx/benchmarks/bench_all_to_all.py + out_dummy = torch.empty((1,), dtype=torch.float32, device="cuda") + gemm = torch.empty( + (2048, 2048) if num_tokens <= 128 else (8192, 8192), + dtype=torch.float32, + device="cuda", ) - # Separate profiling - for return_recv_hook in (False, True): - dispatch_t, combine_t = bench_kineto( - partial(test_func, return_recv_hook=return_recv_hook), - kernel_names=("dispatch", "combine"), - barrier_comm_profiling=True, - suppress_kineto_output=True, - num_kernels_per_period=2 if return_recv_hook else 1, + rng = torch.Generator(device="cuda") + rng.manual_seed(rank + seed + 123) + + pending_dispatch_hook = None + pending_combine_hook = None + pending_recv_x = None + pending_handle = None + bench_topk_idx = topk_idx + + def wait(): + # Same "wait" structure as bench_all_to_all.py + dist.all_reduce(out_dummy, group=group) + _ = gemm @ gemm + dist.all_reduce(out_dummy, group=group) + + def _rand_topk_idx() -> torch.Tensor: + scores = torch.randn( + (num_tokens, num_experts), + dtype=torch.float32, + device="cuda", + generator=rng, ) - # kineto profiling failed. 
- if dispatch_t == 0 or combine_t == 0: - continue - if not return_recv_hook: - print( - f"[rank {rank}] Dispatch bandwidth: {num_dispatch_comm_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | " - f"Combine bandwidth: {num_combine_comm_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us", - flush=True, + scores = scores.abs() + 1 + return torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1] + + def dispatch(do_send: bool, do_recv: bool): + nonlocal pending_dispatch_hook + nonlocal pending_recv_x + nonlocal pending_handle + if do_send: + recv_x, _, handle, _, hook = buffer.low_latency_dispatch( + current_x, + bench_topk_idx, + num_tokens, + num_experts, + cumulative_local_expert_recv_stats=None, + use_fp8=dispatch_use_fp8, + async_finish=False, + return_recv_hook=not do_recv, ) - else: + if do_recv: + return recv_x, handle + pending_dispatch_hook = hook + pending_recv_x = recv_x + pending_handle = handle + return None, None + assert do_recv, "Invalid dispatch mode" + assert pending_dispatch_hook is not None + pending_dispatch_hook() + out = (pending_recv_x, pending_handle) + pending_dispatch_hook = None + pending_recv_x = None + pending_handle = None + return out + + def materialize_for_combine(recv_x): + if dispatch_use_fp8: + return per_token_cast_back( + recv_x[0].view(-1, hidden), + recv_x[1].contiguous().view(-1, hidden // 128), + ).view(recv_x[0].shape) + return recv_x + + def combine(simulated_x, handle, do_send: bool, do_recv: bool): + nonlocal pending_combine_hook + if do_send: + _, _, hook = buffer.low_latency_combine( + simulated_x, + bench_topk_idx, + topk_weights, + handle, + use_logfmt=use_logfmt, + return_recv_hook=not do_recv, + ) + if not do_recv: + pending_combine_hook = hook + return + assert do_recv, "Invalid combine mode" + assert pending_combine_hook is not None + pending_combine_hook() + pending_combine_hook = None + + events = [] + for _ in range(num_warmup + num_repeats): + dispatch_start = 
torch.cuda.Event(enable_timing=True) + dispatch_end = torch.cuda.Event(enable_timing=True) + combine_start = torch.cuda.Event(enable_timing=True) + combine_end = torch.cuda.Event(enable_timing=True) + dispatch_send_start = torch.cuda.Event(enable_timing=True) + dispatch_send_end = torch.cuda.Event(enable_timing=True) + dispatch_recv_start = torch.cuda.Event(enable_timing=True) + dispatch_recv_end = torch.cuda.Event(enable_timing=True) + combine_send_start = torch.cuda.Event(enable_timing=True) + combine_send_end = torch.cuda.Event(enable_timing=True) + combine_recv_start = torch.cuda.Event(enable_timing=True) + combine_recv_end = torch.cuda.Event(enable_timing=True) + dispatch_start.record() + dispatch_end.record() + combine_start.record() + combine_end.record() + dispatch_send_start.record() + dispatch_send_end.record() + dispatch_recv_start.record() + dispatch_recv_end.record() + combine_send_start.record() + combine_send_end.record() + combine_recv_start.record() + combine_recv_end.record() + events.append( + ( + dispatch_start, + dispatch_end, + combine_start, + combine_end, + dispatch_send_start, + dispatch_send_end, + dispatch_recv_start, + dispatch_recv_end, + combine_send_start, + combine_send_end, + combine_recv_start, + combine_recv_end, + ) + ) + + last_report_time = time.time() + profiler_started = False + for i in range(num_warmup + num_repeats): + if i + 1 == num_warmup and num_warmup > 0: + torch.cuda.profiler.start() + profiler_started = True + now = time.time() + if rank == 0 and ( + now - last_report_time > 1 or i + 1 == num_warmup + num_repeats + ): print( - f"[rank {rank}] Dispatch send/recv time: {dispatch_t[0] * 1e6:.2f} + {dispatch_t[1] * 1e6:.2f} us | " - f"Combine send/recv time: {combine_t[0] * 1e6:.2f} + {combine_t[1] * 1e6:.2f} us", + f"[rank 0] Iteration {i + 1}/{num_warmup + num_repeats}", flush=True, ) + last_report_time = now + + ( + dispatch_start, + dispatch_end, + combine_start, + combine_end, + dispatch_send_start, + 
dispatch_send_end, + dispatch_recv_start, + dispatch_recv_end, + combine_send_start, + combine_send_end, + combine_recv_start, + combine_recv_end, + ) = events[i] + + bench_topk_idx = _rand_topk_idx() + + # Send + recv back-to-back + wait() + dispatch_start.record() + recv_x, handle = dispatch(do_send=True, do_recv=True) + dispatch_end.record() + simulated_x = materialize_for_combine(recv_x) + + wait() + combine_start.record() + combine(simulated_x, handle, do_send=True, do_recv=True) + combine_end.record() + + # Send and recv split by long kernels + wait() + dispatch_send_start.record() + dispatch(do_send=True, do_recv=False) + dispatch_send_end.record() + + wait() + dispatch_recv_start.record() + recv_x, handle = dispatch(do_send=False, do_recv=True) + dispatch_recv_end.record() + simulated_x = materialize_for_combine(recv_x) + + wait() + combine_send_start.record() + combine(simulated_x, handle, do_send=True, do_recv=False) + combine_send_end.record() + + wait() + combine_recv_start.record() + combine(None, None, do_send=False, do_recv=True) + combine_recv_end.record() + + torch.cuda.synchronize() + if profiler_started: + torch.cuda.profiler.stop() + + dispatch_times_us = [] + dispatch_send_times_us = [] + dispatch_recv_times_us = [] + combine_times_us = [] + combine_send_times_us = [] + combine_recv_times_us = [] + for ( + dispatch_st, + dispatch_en, + combine_st, + combine_en, + dispatch_send_st, + dispatch_send_en, + dispatch_recv_st, + dispatch_recv_en, + combine_send_st, + combine_send_en, + combine_recv_st, + combine_recv_en, + ) in events[num_warmup:]: + dispatch_times_us.append(dispatch_st.elapsed_time(dispatch_en) * 1000.0) + combine_times_us.append(combine_st.elapsed_time(combine_en) * 1000.0) + dispatch_send_times_us.append( + dispatch_send_st.elapsed_time(dispatch_send_en) * 1000.0 + ) + dispatch_recv_times_us.append( + dispatch_recv_st.elapsed_time(dispatch_recv_en) * 1000.0 + ) + combine_send_times_us.append( + 
combine_send_st.elapsed_time(combine_send_en) * 1000.0 + ) + combine_recv_times_us.append( + combine_recv_st.elapsed_time(combine_recv_en) * 1000.0 + ) + + gathered = [None for _ in range(num_ranks)] + dist.all_gather_object(gathered, dispatch_times_us, group=group) + dispatch_times_us = [v for per_rank in gathered for v in per_rank] + dist.all_gather_object(gathered, dispatch_send_times_us, group=group) + dispatch_send_times_us = [v for per_rank in gathered for v in per_rank] + dist.all_gather_object(gathered, dispatch_recv_times_us, group=group) + dispatch_recv_times_us = [v for per_rank in gathered for v in per_rank] + dist.all_gather_object(gathered, combine_times_us, group=group) + combine_times_us = [v for per_rank in gathered for v in per_rank] + dist.all_gather_object(gathered, combine_send_times_us, group=group) + combine_send_times_us = [v for per_rank in gathered for v in per_rank] + dist.all_gather_object(gathered, combine_recv_times_us, group=group) + combine_recv_times_us = [v for per_rank in gathered for v in per_rank] + + def _p50(values): + return float(np.percentile(np.asarray(values), 50)) + + if rank == 0: + dispatch_p50_s = _p50(dispatch_times_us) / 1e6 + combine_p50_s = _p50(combine_times_us) / 1e6 + dispatch_bw = num_dispatch_comm_bytes / 1e9 / dispatch_p50_s + combine_bw = num_combine_comm_bytes / 1e9 / combine_p50_s + + print( + f"[rank 0] Dispatch both p50: {_p50(dispatch_times_us):.2f} us, {dispatch_bw:.2f} GB/s", + flush=True, + ) + print( + f"[rank 0] Dispatch send p50: {_p50(dispatch_send_times_us):.2f} us", + flush=True, + ) + print( + f"[rank 0] Dispatch recv p50: {_p50(dispatch_recv_times_us):.2f} us", + flush=True, + ) + print( + f"[rank 0] Combine both p50: {_p50(combine_times_us):.2f} us, {combine_bw:.2f} GB/s", + flush=True, + ) + print( + f"[rank 0] Combine send p50: {_p50(combine_send_times_us):.2f} us", + flush=True, + ) + print( + f"[rank 0] Combine recv p50: {_p50(combine_recv_times_us):.2f} us", + flush=True, + ) return 
(hash_value, hash_details) if debug_hash else hash_value @@ -595,6 +814,8 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): skip_benchmark=args.pressure_test_mode == 1, debug_hash=args.debug_hash, stop_after_first=args.stop_after_first, + num_warmup=args.num_warmup, + num_repeats=args.num_repeats, ) if args.debug_hash: ref_hash, ref_hash_details = ref_out @@ -624,6 +845,8 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): skip_benchmark=args.pressure_test_mode == 1, debug_hash=args.debug_hash, stop_after_first=args.stop_after_first, + num_warmup=args.num_warmup, + num_repeats=args.num_repeats, ) if args.debug_hash: current_hash, current_hash_details = cur_out @@ -727,6 +950,18 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): action="store_true", help="Stop after the first experiment (return_recv_hook=False, dispatch_use_fp8=False, round_scale=False, use_ue8m0=False).", ) + parser.add_argument( + "--num-warmup", + type=int, + default=200, + help="Number of warmup iterations for event timing benchmark.", + ) + parser.add_argument( + "--num-repeats", + type=int, + default=500, + help="Number of measured iterations for event timing benchmark.", + ) args = parser.parse_args() num_processes = args.num_processes From c9d4b63570bade868b84c5178b47489479ed0b82 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Fri, 20 Feb 2026 03:04:01 -0500 Subject: [PATCH 15/15] [unfinished commit] Add global counter for dst GPU batch, sending counter and receveier logic are todo --- ep/bench/launch_distributed_ll.sh | 7 +- ep/include/ep_config.hpp | 19 ++- ep/src/internode_ll.cu | 208 +++++++++++++++++++++++++----- 3 files changed, 191 insertions(+), 43 deletions(-) diff --git a/ep/bench/launch_distributed_ll.sh b/ep/bench/launch_distributed_ll.sh index bb868e0cd..3c7ed4232 100755 --- a/ep/bench/launch_distributed_ll.sh +++ b/ep/bench/launch_distributed_ll.sh @@ -52,8 +52,8 @@ done # Node IPs 
(in order of node_rank) NODES=( - "172.31.16.74" # master - "172.31.18.162" + "172.31.5.88" # master (ip-172-31-5-88) + "172.31.6.215" # ip-172-31-6-215 ) # Configuration - MASTER_ADDR automatically set to first node @@ -219,7 +219,8 @@ export UCCL_LOG_DIR=${RUN_LOG_DIR}" --num-tokens=${NUM_TOKENS} \ --hidden=${HIDDEN} \ --num-topk=${NUM_TOPK} \ - --num-experts=${NUM_EXPERTS}" + --num-experts=${NUM_EXPERTS} \ + --stop-after-first" echo "[$(date '+%H:%M:%S')] Command to launch on node ${node_rank}:" echo "${conda_cmd}" ssh -o StrictHostKeyChecking=no \ diff --git a/ep/include/ep_config.hpp b/ep/include/ep_config.hpp index 082fcdf29..aeb4b2880 100644 --- a/ep/include/ep_config.hpp +++ b/ep/include/ep_config.hpp @@ -197,15 +197,20 @@ struct LowLatencyLayout { // Send buffer #ifdef LAM_DEV + constexpr size_t kNumMaxTopK = 9; // Lam: Buffer layout for batched RDMA sends: - // ┌─────────────────────────────────┬──────────────────────────────────────────────┐ - // │ Temp buffer (offset 0) │ RDMA batch buffer (offset num_max_tokens) │ - // │ rdma_x[token_idx] │ rdma_x[num_max_tokens + expert*max + slot] │ - // │ Size: num_max_tokens * msg_size │ Size: num_experts * num_max_tokens * msg_size│ - // └─────────────────────────────────┴──────────────────────────────────────────────┘ - // Flow: FP8 cast -> temp buffer -> copy to rdma_batch_buffer -> batch RDMA send + // ┌─────────────────────────────────┬──────────────────────────────────────────────┬──────────────────────────────────────────────┐ + // │ Temp buffer (offset 0) │ Expert batch buffer │ Rank batch buffer │ + // │ rdma_x[token_idx] │ rdma_x[num_max_tokens + expert*max + slot] │ rdma_x[... 
+ rank*rank_cap + rank_slot] │ + // │ Size: num_max_tokens * msg_size │ Size: num_experts * num_max_tokens * msg_size│ Size: num_ranks*max*kNumMaxTopK*msg_size │ + // └─────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┘ + // Flow: FP8 cast -> temp -> expert batch (legacy path) + rank batch (new path) + size_t rank_batch_capacity_per_rank = + num_max_dispatch_tokens_per_rank * kNumMaxTopK; size_t dispatch_send_buffer_bytes = - (num_experts + 1) * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; + ((1 + num_experts) * num_max_dispatch_tokens_per_rank + + num_ranks * rank_batch_capacity_per_rank) * + num_bytes_per_dispatch_msg; #else size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index 3d74672a8..16ba7d5ae 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -64,6 +64,7 @@ __global__ __launch_bounds__(1024, 1) void dispatch( int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, #ifdef LAM_DEV int* atomic_send_counter_per_expert, + int* atomic_send_counter_per_rank, int* rank_send_prefix, #endif int* next_clean, int64_t* next_clean_second, int num_next_clean_int, int num_tokens, int num_max_dispatch_tokens_per_rank, int num_topk, @@ -253,15 +254,80 @@ __global__ __launch_bounds__(1024, 1) void dispatch( : 0; if (dst_p2p_ptr == 0) { #ifdef LAM_DEV + int lam_rank_slot = lane_id == 0 + ? 
atomicAdd(atomic_send_counter_per_rank + + dst_rank, + 1) + : 0; + lam_rank_slot = __shfl_sync(WARP_MASK, lam_rank_slot, 0); // Lam: IBGDA -> copy temp to rdma_batch_buffer, batch send later auto const lam_slot = shared_send_slots[warp_id]; - auto const batch_buf_offset = num_max_dispatch_tokens_per_rank * num_bytes_per_msg; - auto const batch_buf_ptr = static_cast(rdma_x) + batch_buf_offset + - (dst_expert_idx * num_max_dispatch_tokens_per_rank + lam_slot) * num_bytes_per_msg; + auto const rank_batch_capacity = + num_max_dispatch_tokens_per_rank * kNumMaxTopK; + EP_DEVICE_ASSERT(lam_rank_slot >= 0); + EP_DEVICE_ASSERT(lam_rank_slot < rank_batch_capacity); + auto const batch_buf_offset = + num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + auto const rank_batch_buf_offset = + (1 + num_experts) * num_max_dispatch_tokens_per_rank * + num_bytes_per_msg; + auto const batch_buf_ptr = + static_cast(rdma_x) + batch_buf_offset + + (dst_expert_idx * num_max_dispatch_tokens_per_rank + lam_slot) * + num_bytes_per_msg; + auto const rank_batch_buf_ptr = + static_cast(rdma_x) + rank_batch_buf_offset + + (dst_rank * rank_batch_capacity + lam_rank_slot) * + num_bytes_per_msg; auto const* src_int4_ptr = reinterpret_cast(rdma_x_src_idx); auto* batch_buf_int4_ptr = reinterpret_cast(batch_buf_ptr); + auto* rank_batch_buf_int4_ptr = + reinterpret_cast(rank_batch_buf_ptr); UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, batch_buf_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global); + // New path: write into per-dst-rank contiguous staging buffer. + UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, rank_batch_buf_int4_ptr, + src_int4_ptr, ld_nc_global, st_na_global); + if (lane_id == 0) { + // Populate reserved header fields for future rank-batched recv + // demux: + // [0] src_token_idx (already written), + // [1] dst_local_expert_idx, + // [2] dst_rank, + // [3] rank_slot. 
+ auto* batch_msg_header = reinterpret_cast(batch_buf_ptr); + batch_msg_header[1] = dst_expert_local_idx; + batch_msg_header[2] = dst_rank; + batch_msg_header[3] = lam_rank_slot; + auto* rank_batch_msg_header = reinterpret_cast(rank_batch_buf_ptr); + rank_batch_msg_header[1] = dst_expert_local_idx; + rank_batch_msg_header[2] = dst_rank; + rank_batch_msg_header[3] = lam_rank_slot; + + // Full mapping print (rank0 only): print every inter-node mapped + // entry at write time, so we can inspect exact slot assignments. + if (rank == 0) { + while (atomicCAS(&g_print_lock, 0, 1) != 0) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + __builtin_amdgcn_s_sleep(1); +#else + ; +#endif + printf( + "[LAM_DEV][dispatch] write src_idx=%d dst_expert=%d " + "dst_local=%d dst_rank=%d expert_slot=%d local_rank_slot=%d " + "expert_buf_index=%d rank_buf_index=%d " + "dst_rank_expert_begin=%d dst_rank_expert_end=%d\n", + batch_msg_header[0], dst_expert_idx, batch_msg_header[1], + batch_msg_header[2], lam_slot, batch_msg_header[3], + dst_expert_idx * num_max_dispatch_tokens_per_rank + lam_slot, + dst_rank * rank_batch_capacity + lam_rank_slot, + dst_rank * num_local_experts, + min((dst_rank + 1) * num_local_experts, num_experts) - 1); + __threadfence_system(); + atomicExch(&g_print_lock, 0); + } + } #else __threadfence_system(); uccl::nvshmemi_ibgda_put_nbi_warp( @@ -349,14 +415,15 @@ __global__ __launch_bounds__(1024, 1) void dispatch( cg::this_grid().sync(); #endif - // Lam: Batch RDMA send phase - send entire expert buffer in ONE IBGDA call - // Each warp group handles one expert (only first sub_warp does the send) - if (responsible_expert_idx < num_experts && sub_warp_id == 0) { - auto const dst_rank = responsible_expert_idx / num_local_experts; - auto const dst_expert_local_idx = responsible_expert_idx % num_local_experts; + // Lam: Batch RDMA send phase - send one rank-contiguous buffer per dst rank. 
+ // Each warp group handles one destination rank (only first sub_warp sends). + if (responsible_expert_idx < num_ranks && sub_warp_id == 0) { + auto const dst_rank = responsible_expert_idx; + auto const rank_batch_capacity = + num_max_dispatch_tokens_per_rank * kNumMaxTopK; // Check if this destination is inter-node (needs IBGDA batch send) - // IPC destinations were already sent in the token loop + // IPC destinations were already sent in the token loop. auto const test_dst_ptr = reinterpret_cast(rdma_recv_x); auto const dst_p2p_ptr = ipc_rdma_base_ptrs @@ -365,41 +432,76 @@ __global__ __launch_bounds__(1024, 1) void dispatch( : 0; if (dst_p2p_ptr == 0) { - // Inter-node: batch send ALL tokens for this expert in ONE call - auto const num_tokens_to_send = - atomic_send_counter_per_expert[responsible_expert_idx]; - + // Inter-node: batch send ALL tokens for this destination rank in ONE call. + auto const num_tokens_to_send = atomic_send_counter_per_rank[dst_rank]; if (num_tokens_to_send > 0) { - auto const batch_buf_offset = - num_max_dispatch_tokens_per_rank * num_bytes_per_msg; - // Source: start of this expert's batch buffer (contiguous) - auto const batch_buf_ptr = - static_cast(rdma_x) + batch_buf_offset + - responsible_expert_idx * num_max_dispatch_tokens_per_rank * - num_bytes_per_msg; - auto const src_ptr = reinterpret_cast(batch_buf_ptr); - // Destination: start of this expert's recv buffer on remote rank + auto const rank_batch_buf_offset = + (1 + num_experts) * num_max_dispatch_tokens_per_rank * + num_bytes_per_msg; + // Source: start of this dst-rank's rank-batch buffer (contiguous). 
+ auto const rank_batch_buf_ptr = + static_cast(rdma_x) + rank_batch_buf_offset + + dst_rank * rank_batch_capacity * num_bytes_per_msg; + auto const src_ptr = reinterpret_cast(rank_batch_buf_ptr); + auto const rank_batch_recv_offset = + num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + // Destination: start of this src-rank's slot in remote rank-batch area. auto const dst_ptr = reinterpret_cast(rdma_recv_x) + - dst_expert_local_idx * num_ranks * - num_max_dispatch_tokens_per_rank * num_bytes_per_msg + - rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; - // Total bytes: all tokens for this expert + rank_batch_recv_offset + + rank * rank_batch_capacity * num_bytes_per_msg; auto const total_bytes = num_tokens_to_send * num_bytes_per_msg; +#ifdef LAM_DEV + if (rank == 0 && lane_id == 0) { + while (atomicCAS(&g_print_lock, 0, 1) != 0) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + __builtin_amdgcn_s_sleep(1); +#else + ; +#endif + printf( + "[LAM_DEV][dispatch] send_ready src_rank=%d dst_rank=%d " + "tokens=%d bytes=%llu rank_capacity=%d src_rank_base_idx=%d\n", + rank, dst_rank, num_tokens_to_send, + static_cast(total_bytes), rank_batch_capacity, + dst_rank * rank_batch_capacity); + __threadfence_system(); + atomicExch(&g_print_lock, 0); + } +#endif + __threadfence_system(); uccl::nvshmemi_ibgda_put_nbi_warp( dst_ptr - reinterpret_cast(rdma_buffer_ptr), src_ptr - reinterpret_cast(rdma_buffer_ptr), total_bytes, dst_rank, - /*warp_id=*/dst_expert_local_idx, // NOTE(Yang): for selecting rb. + /*warp_id=*/ + dst_rank % ((num_local_experts > 0) ? 
num_local_experts : 1), lane_id, /*slot=*/0, d2h_channel_addrs, num_d2h_channel_addrs, false, low_latency_buffer_idx, 0, 0, num_tokens_to_send); - + +#ifdef LAM_DEV + if (rank == 0 && lane_id == 0) { + while (atomicCAS(&g_print_lock, 0, 1) != 0) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + __builtin_amdgcn_s_sleep(1); +#else + ; +#endif + printf( + "[LAM_DEV][dispatch] send_done src_rank=%d dst_rank=%d " + "tokens=%d bytes=%llu\n", + rank, dst_rank, num_tokens_to_send, + static_cast(total_bytes)); + __threadfence_system(); + atomicExch(&g_print_lock, 0); + } +#endif } } - // IPC: already sent in the token loop, nothing to do here + // IPC: already sent in the token loop, nothing to do here. } __threadfence_system(); // Ensure batch sends are visible before count sends @@ -454,6 +556,10 @@ __global__ __launch_bounds__(1024, 1) void dispatch( #ifdef LAM_DEV atomic_send_counter_per_expert[responsible_expert_idx] = 0; + if (responsible_expert_idx < num_ranks) { + atomic_send_counter_per_rank[responsible_expert_idx] = 0; + rank_send_prefix[responsible_expert_idx] = 0; + } #endif // Clean `packed_recv_count` if (dst_rank == 0) packed_recv_count[dst_expert_local_idx] = 0; @@ -589,10 +695,40 @@ LOW_LATENCY_DISPATCH_RECV: // Copy tokens EP_DEVICE_ASSERT(num_scales <= 64); + auto const rank_batch_capacity = + num_max_dispatch_tokens_per_rank * kNumMaxTopK; + auto const rank_batch_recv_offset = + num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + auto const rank_batch_recv_x_uint8 = + static_cast(rdma_recv_x) + + rank_batch_recv_offset + + src_rank * rank_batch_capacity * num_bytes_per_msg; + int inter_rank_scan_cursor = 0; for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) { // Copy source info - auto const src_src_idx = - reinterpret_cast(rdma_recv_x_uint8 + i * num_bytes_per_msg); + int matched_rank_slot = i; + if (src_rank / max_nvl_peers != rank / max_nvl_peers) { + if (lane_id == 0) { + matched_rank_slot = -1; + for 
(int s = inter_rank_scan_cursor; s < rank_batch_capacity; ++s) { + auto const* hdr = reinterpret_cast( + rank_batch_recv_x_uint8 + s * num_bytes_per_msg); + auto const dst_local_expert = ld_nc_global(hdr + 1); + if (dst_local_expert == local_expert_idx) { + matched_rank_slot = s; + inter_rank_scan_cursor = s + 1; + break; + } + } + EP_DEVICE_ASSERT(matched_rank_slot >= 0); + } + matched_rank_slot = __shfl_sync(WARP_MASK, matched_rank_slot, 0); + } + auto const src_src_idx = reinterpret_cast( + (src_rank / max_nvl_peers == rank / max_nvl_peers) + ? (rdma_recv_x_uint8 + i * num_bytes_per_msg) + : (rank_batch_recv_x_uint8 + + matched_rank_slot * num_bytes_per_msg)); if (lane_id == 0) recv_src_info[recv_token_begin_idx + i] = ld_nc_global(src_src_idx); __syncwarp(); @@ -672,8 +808,12 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, #ifdef LAM_DEV auto atomic_send_counter_per_expert = atomic_finish_counter_per_expert + num_experts; - auto grid_sync_barrier_ptr = atomic_send_counter_per_expert + num_experts; - EP_HOST_ASSERT((num_experts * 3 + 1) * sizeof(int) <= NUM_WORKSPACE_BYTES); + auto atomic_send_counter_per_rank = + atomic_send_counter_per_expert + num_experts; + auto rank_send_prefix = atomic_send_counter_per_rank + num_ranks; + auto grid_sync_barrier_ptr = rank_send_prefix + num_ranks; + EP_HOST_ASSERT((num_experts * 3 + num_ranks * 2 + 1) * sizeof(int) <= + NUM_WORKSPACE_BYTES); #else auto grid_sync_barrier_ptr = atomic_finish_counter_per_expert + num_experts; EP_HOST_ASSERT((num_experts * 2 + 1) * sizeof(int) <= NUM_WORKSPACE_BYTES); @@ -696,6 +836,8 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, rdma_recv_x, rdma_recv_count, rdma_x, x, topk_idx, \ atomic_counter_per_expert, atomic_finish_counter_per_expert, \ LAM_DEV_ARG(atomic_send_counter_per_expert) \ + LAM_DEV_ARG(atomic_send_counter_per_rank) \ + LAM_DEV_ARG(rank_send_prefix) \ next_clean, next_clean_second, num_next_clean_int, num_tokens, \ 
num_max_dispatch_tokens_per_rank, num_topk, num_experts, rank, \ num_ranks, num_warp_groups, num_warps_per_group, round_scale, phases, \