Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 62 additions & 8 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,13 @@ build_p2p() {
else
echo "[container] USE_TCPX=1, skipping copying p2p runtime files"
fi
if [[ "$TARGET" == rocm* ]]; then
# Build DietGPU from source — only needed for the ROCm target.
# Run in a subshell so a failed 'cd' cannot leave the caller in the
# wrong directory (set -e aborts, but the cwd change would persist).
(
cd thirdparty/dietgpu
rm -rf build/
python3 setup.py build
)
# The old 'cp build/**/*.so' relied on '**', which only recurses when
# 'shopt -s globstar' is set (it is not here); use find so the copy
# works however deep setup.py nests the built extension.
find thirdparty/dietgpu/build -name '*.so' -exec cp -- {} uccl/ \;
fi
}

build_ep() {
Expand Down Expand Up @@ -180,18 +187,17 @@ build_eccl() {
# Build the experimental eccl library inside the container and copy the
# resulting shared objects into uccl/lib.
set -euo pipefail
echo "[container] build_eccl Target: $TARGET"

# NOTE(review): both 'cd eccl' and 'cd experimental/eccl' appear here —
# this looks like a diff-render artifact keeping the removed and added
# lines; only 'cd experimental/eccl' should survive. Confirm.
cd eccl
cd experimental/eccl
if [[ "$TARGET" == cuda* ]]; then
echo "Skipping eccl build on Cuda."
return
# NOTE(review): unreachable after the 'return' above — presumably the
# old (removed) CUDA build line retained by the diff view. Confirm
# whether CUDA should skip or build.
make clean -f Makefile && make -j$(nproc) -f Makefile
elif [[ "$TARGET" == rocm* ]]; then
# ROCm builds use a dedicated makefile.
make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm
fi
# NOTE(review): same old/new artifact — 'cd ..' vs 'cd ../..'.
cd ..
cd ../..

echo "[container] Copying eccl .so to uccl/"
# mkdir -p uccl/lib
# cp eccl/eccl.*.so uccl/
mkdir -p uccl/lib # mkdir anyway
cp experimental/eccl/*eccl*.so uccl/lib
}

# Determine the Docker image to use based on the target and architecture
Expand Down Expand Up @@ -250,6 +256,44 @@ else
fi

echo "[2/3] Running build inside container..."

# Auto-detect the local GPU architecture so the ep/p2p builds can target it.
# Leaves DETECTED_GPU_ARCH empty when no detection tool is available; an
# explicit TORCH_CUDA_ARCH_LIST in the environment always wins.
DETECTED_GPU_ARCH=""
if [[ "$BUILD_TYPE" =~ (ep|all|p2p) ]]; then
    if [[ "$TARGET" == cuda* ]] && command -v nvidia-smi &> /dev/null; then
        DETECTED_GPU_ARCH="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 | tr -d ' ' || true)"

        if [[ -n "$DETECTED_GPU_ARCH" ]]; then
            echo "Auto-detected CUDA compute capability: ${DETECTED_GPU_ARCH}"
        fi
    elif [[ "$TARGET" == rocm* ]] && command -v amd-smi &> /dev/null; then
        # Parse amd-smi's JSON with python3 (already required by this build)
        # instead of jq: the previous 'pip install jq' fallback installed the
        # Python bindings only, never the jq CLI, so 'command -v jq' could
        # still fail afterwards.
        DETECTED_GPU_ARCH="$(
            PYTHONWARNINGS=ignore \
            amd-smi static -g 0 --asic --json 2>/dev/null \
            | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    gpus = data.get("gpu_data") or []
    if gpus:
        print(gpus[0]["asic"]["target_graphics_version"])
except Exception:
    pass
' \
            || true
        )"
        if [[ -n "$DETECTED_GPU_ARCH" ]]; then
            echo "Auto-detected ROCm architecture: ${DETECTED_GPU_ARCH}"
        fi
    else
        echo "[INFO] No compatible GPU detection tool found, skipping auto-detect"
    fi
fi

export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-${DETECTED_GPU_ARCH}}"

docker run --rm --user "$(id -u):$(id -g)" \
-v /etc/passwd:/etc/passwd:ro \
-v /etc/group:/etc/group:ro \
Expand All @@ -263,6 +307,8 @@ docker run --rm --user "$(id -u):$(id -g)" \
-e WHEEL_DIR="${WHEEL_DIR}" \
-e BUILD_TYPE="${BUILD_TYPE}" \
-e USE_TCPX="${USE_TCPX:-0}" \
-e USE_EFA="${USE_EFA:-0}" \
-e USE_IB="${USE_IB:-0}" \
-e MAKE_NORMAL_MODE="${MAKE_NORMAL_MODE:-}" \
-e FUNCTION_DEF="$(declare -f build_rccl_nccl_h build_rdma build_efa build_p2p build_ep build_eccl)" \
-w /io \
Expand Down Expand Up @@ -346,7 +392,15 @@ def initialize():
mv ${BACKUP_FN} setup.py
fi

auditwheel repair dist/uccl-*.whl --exclude "libtorch*.so" --exclude "libc10*.so" --exclude "libibverbs.so.1" --exclude "libcudart.so.12" --exclude "libamdhip64.so.*" --exclude "libcuda.so.1" -w /io/${WHEEL_DIR}
# Repair the wheel but exclude libraries that must come from the host at
# runtime (torch, CUDA/ROCm runtimes, verbs/EFA) so the wheel stays small
# and always uses the host's driver-matched copies.
auditwheel repair dist/uccl-*.whl \
    --exclude "libtorch*.so" \
    --exclude "libc10*.so" \
    --exclude "libibverbs.so.1" \
    --exclude "libcudart.so.12" \
    --exclude "libamdhip64.so.*" \
    --exclude "libcuda.so.1" \
    --exclude "libefa.so.1" \
    -w "/io/${WHEEL_DIR}"

# Add backend tag to wheel filename using local version identifier
if [[ "$TARGET" == rocm* || "$TARGET" == "therock" ]]; then
Expand Down Expand Up @@ -378,4 +432,4 @@ def initialize():

# 3. Done
echo "[3/3] Wheel built successfully (stored in ${WHEEL_DIR}):"
ls -lh "${WHEEL_DIR}"/uccl-*.whl || true
ls -lh "${WHEEL_DIR}"/uccl-*.whl || true
13 changes: 13 additions & 0 deletions build_and_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,16 @@ else
# That (currently) requires --extra-index-url
pip install --extra-index-url ${ROCM_IDX_URL} $(ls wheelhouse-$TARGET/uccl-*.whl)[rocm]
fi

# Report where pip installed the uccl package so users can point
# LIBRARY_PATH at its bundled shared libraries.
# Use sed instead of "cut -d' ' -f2": cut would truncate an install
# prefix that contains spaces at the first space.
UCCL_INSTALL_PATH=$(pip show uccl 2>/dev/null | sed -n 's/^Location: //p' || echo "")
if [[ -n "$UCCL_INSTALL_PATH" && -d "$UCCL_INSTALL_PATH" ]]; then
    UCCL_PACKAGE_PATH="$UCCL_INSTALL_PATH/uccl"
    if [[ -d "$UCCL_PACKAGE_PATH" ]]; then
        echo "UCCL installed at: $UCCL_PACKAGE_PATH"
        echo "Set LIBRARY_PATH: export LIBRARY_PATH=\"$UCCL_PACKAGE_PATH/lib:\$LIBRARY_PATH\""
    else
        echo "UCCL package directory not found at: $UCCL_PACKAGE_PATH"
    fi
else
    echo "Warning: Could not detect UCCL installation path"
fi
8 changes: 4 additions & 4 deletions docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential cmake git ninja-build g++ make patchelf \
rdma-core libibverbs-dev \
rdma-core libibverbs-dev libnuma-dev \
libgoogle-glog-dev libgflags-dev libgtest-dev libelf-dev \
libnuma-dev libdrm-dev libdrm-amdgpu1 \
pkg-config zlib1g-dev curl \
software-properties-common && \
pkg-config zlib1g-dev curl unzip \
software-properties-common \
hipcub && \
\
# ───── Add Python ${PY_VER} PPA & install Python ${PY_VER} + setuptools ─────
add-apt-repository ppa:deadsnakes/ppa && \
Expand Down
32 changes: 32 additions & 0 deletions ep/bench/run_ep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Launch the UCCL EP benchmarks across nodes via torchrun.
# The original first line was '# !/bin/bash' — the space made it a plain
# comment, not a shebang, so the script could run under whatever shell
# invoked it.
#
# Usage: run_ep.sh [NNODES] [RANK] [MODE] [MAIN_IP]
#   MODE: ll | ht | ll-pressure | ht-pressure

NNODES=${1:-2}
RANK=${2:-0}
MODE=${3:-ll}             # ll, ht, ll-pressure, ht-pressure
MAIN_IP=${4:-10.1.18.53}

export OMP_NUM_THREADS=6
echo "Running nodes $NNODES, rank $RANK, mode $MODE, main IP $MAIN_IP"

# torchrun arguments shared by every mode (quoted to survive odd values).
launch=(torchrun --nnodes="$NNODES" --nproc_per_node=8 --node_rank="$RANK"
        --master_addr="$MAIN_IP" --master_port=12355)

case "$MODE" in
  ll)
    # NOTE(review): 'll' already passes --pressure-test-mode=2, identical
    # to 'll-pressure' except for --debug-hash — confirm this is intended.
    "${launch[@]}" test_low_latency.py --num-tokens=128 \
        --hidden=7168 --num-topk=8 --num-experts=288 --pressure-test-mode=2
    ;;
  ht)
    "${launch[@]}" test_internode.py --num-tokens=4096 \
        --hidden=7168 --num-topk=8 --num-experts=288 --test-ll-compatibility
    ;;
  ll-pressure)
    "${launch[@]}" test_low_latency.py --num-tokens=128 \
        --hidden=7168 --num-topk=8 --num-experts=288 --pressure-test-mode=2 --debug-hash
    ;;
  ht-pressure)
    "${launch[@]}" test_internode.py --num-tokens=4096 \
        --hidden=7168 --num-topk=8 --num-experts=288 --test-ll-compatibility --pressure-test-mode=2
    ;;
  *)
    # The original silently did nothing on a typo'd mode; fail loudly instead.
    echo "Unknown mode: $MODE (expected ll|ht|ll-pressure|ht-pressure)" >&2
    exit 1
    ;;
esac
# --log-dir=logs --redirect=3
Loading