Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 62 additions & 8 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,13 @@ build_p2p() {
else
echo "[container] USE_TCPX=1, skipping copying p2p runtime files"
fi
if [[ "$TARGET" == rocm* ]]; then
# Build DietGPU from source — only needed for the ROCm target.
# Run in a subshell so a failed 'cd' cannot leave the caller in the
# wrong directory (set -e aborts, but the cwd change would persist).
(
cd thirdparty/dietgpu
rm -rf build/
python3 setup.py build
)
# The old 'cp build/**/*.so' relied on '**', which only recurses when
# 'shopt -s globstar' is set (it is not here); use find so the copy
# works however deep setup.py nests the built extension.
find thirdparty/dietgpu/build -name '*.so' -exec cp -- {} uccl/ \;
fi
}

build_ep() {
Expand Down Expand Up @@ -180,18 +187,17 @@ build_eccl() {
# Build the experimental eccl library inside the container and copy the
# resulting shared objects into uccl/lib.
set -euo pipefail
echo "[container] build_eccl Target: $TARGET"

# NOTE(review): both 'cd eccl' and 'cd experimental/eccl' appear here —
# this looks like a diff-render artifact keeping the removed and added
# lines; only 'cd experimental/eccl' should survive. Confirm.
cd eccl
cd experimental/eccl
if [[ "$TARGET" == cuda* ]]; then
echo "Skipping eccl build on Cuda."
return
# NOTE(review): unreachable after the 'return' above — presumably the
# old (removed) CUDA build line retained by the diff view. Confirm
# whether CUDA should skip or build.
make clean -f Makefile && make -j$(nproc) -f Makefile
elif [[ "$TARGET" == rocm* ]]; then
# ROCm builds use a dedicated makefile.
make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm
fi
# NOTE(review): same old/new artifact — 'cd ..' vs 'cd ../..'.
cd ..
cd ../..

echo "[container] Copying eccl .so to uccl/"
# mkdir -p uccl/lib
# cp eccl/eccl.*.so uccl/
mkdir -p uccl/lib # mkdir anyway
cp experimental/eccl/*eccl*.so uccl/lib
}

# Determine the Docker image to use based on the target and architecture
Expand Down Expand Up @@ -250,6 +256,44 @@ else
fi

echo "[2/3] Running build inside container..."

# Auto-detect the local GPU architecture so the ep/p2p builds can target it.
# Leaves DETECTED_GPU_ARCH empty when no detection tool is available; an
# explicit TORCH_CUDA_ARCH_LIST in the environment always wins.
DETECTED_GPU_ARCH=""
if [[ "$BUILD_TYPE" =~ (ep|all|p2p) ]]; then
    if [[ "$TARGET" == cuda* ]] && command -v nvidia-smi &> /dev/null; then
        DETECTED_GPU_ARCH="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 | tr -d ' ' || true)"

        if [[ -n "$DETECTED_GPU_ARCH" ]]; then
            echo "Auto-detected CUDA compute capability: ${DETECTED_GPU_ARCH}"
        fi
    elif [[ "$TARGET" == rocm* ]] && command -v amd-smi &> /dev/null; then
        # Parse amd-smi's JSON with python3 (already required by this build)
        # instead of jq: the previous 'pip install jq' fallback installed the
        # Python bindings only, never the jq CLI, so 'command -v jq' could
        # still fail afterwards.
        DETECTED_GPU_ARCH="$(
            PYTHONWARNINGS=ignore \
            amd-smi static -g 0 --asic --json 2>/dev/null \
            | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    gpus = data.get("gpu_data") or []
    if gpus:
        print(gpus[0]["asic"]["target_graphics_version"])
except Exception:
    pass
' \
            || true
        )"
        if [[ -n "$DETECTED_GPU_ARCH" ]]; then
            echo "Auto-detected ROCm architecture: ${DETECTED_GPU_ARCH}"
        fi
    else
        echo "[INFO] No compatible GPU detection tool found, skipping auto-detect"
    fi
fi

export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-${DETECTED_GPU_ARCH}}"

docker run --rm --user "$(id -u):$(id -g)" \
-v /etc/passwd:/etc/passwd:ro \
-v /etc/group:/etc/group:ro \
Expand All @@ -263,6 +307,8 @@ docker run --rm --user "$(id -u):$(id -g)" \
-e WHEEL_DIR="${WHEEL_DIR}" \
-e BUILD_TYPE="${BUILD_TYPE}" \
-e USE_TCPX="${USE_TCPX:-0}" \
-e USE_EFA="${USE_EFA:-0}" \
-e USE_IB="${USE_IB:-0}" \
-e MAKE_NORMAL_MODE="${MAKE_NORMAL_MODE:-}" \
-e FUNCTION_DEF="$(declare -f build_rccl_nccl_h build_rdma build_efa build_p2p build_ep build_eccl)" \
-w /io \
Expand Down Expand Up @@ -346,7 +392,15 @@ def initialize():
mv ${BACKUP_FN} setup.py
fi

auditwheel repair dist/uccl-*.whl --exclude "libtorch*.so" --exclude "libc10*.so" --exclude "libibverbs.so.1" --exclude "libcudart.so.12" --exclude "libamdhip64.so.*" --exclude "libcuda.so.1" -w /io/${WHEEL_DIR}
# Repair the wheel but exclude libraries that must come from the host at
# runtime (torch, CUDA/ROCm runtimes, verbs/EFA) so the wheel stays small
# and always uses the host's driver-matched copies.
auditwheel repair dist/uccl-*.whl \
    --exclude "libtorch*.so" \
    --exclude "libc10*.so" \
    --exclude "libibverbs.so.1" \
    --exclude "libcudart.so.12" \
    --exclude "libamdhip64.so.*" \
    --exclude "libcuda.so.1" \
    --exclude "libefa.so.1" \
    -w "/io/${WHEEL_DIR}"

# Add backend tag to wheel filename using local version identifier
if [[ "$TARGET" == rocm* || "$TARGET" == "therock" ]]; then
Expand Down Expand Up @@ -378,4 +432,4 @@ def initialize():

# 3. Done
echo "[3/3] Wheel built successfully (stored in ${WHEEL_DIR}):"
ls -lh "${WHEEL_DIR}"/uccl-*.whl || true
ls -lh "${WHEEL_DIR}"/uccl-*.whl || true
13 changes: 13 additions & 0 deletions build_and_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,16 @@ else
# That (currently) requires --extra-index-url
pip install --extra-index-url ${ROCM_IDX_URL} $(ls wheelhouse-$TARGET/uccl-*.whl)[rocm]
fi

# Report where pip installed the uccl package so users can point
# LIBRARY_PATH at its bundled shared libraries.
# Use sed instead of "cut -d' ' -f2": cut would truncate an install
# prefix that contains spaces at the first space.
UCCL_INSTALL_PATH=$(pip show uccl 2>/dev/null | sed -n 's/^Location: //p' || echo "")
if [[ -n "$UCCL_INSTALL_PATH" && -d "$UCCL_INSTALL_PATH" ]]; then
    UCCL_PACKAGE_PATH="$UCCL_INSTALL_PATH/uccl"
    if [[ -d "$UCCL_PACKAGE_PATH" ]]; then
        echo "UCCL installed at: $UCCL_PACKAGE_PATH"
        echo "Set LIBRARY_PATH: export LIBRARY_PATH=\"$UCCL_PACKAGE_PATH/lib:\$LIBRARY_PATH\""
    else
        echo "UCCL package directory not found at: $UCCL_PACKAGE_PATH"
    fi
else
    echo "Warning: Could not detect UCCL installation path"
fi
8 changes: 4 additions & 4 deletions docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential cmake git ninja-build g++ make patchelf \
rdma-core libibverbs-dev \
rdma-core libibverbs-dev libnuma-dev \
libgoogle-glog-dev libgflags-dev libgtest-dev libelf-dev \
libnuma-dev libdrm-dev libdrm-amdgpu1 \
pkg-config zlib1g-dev curl \
software-properties-common && \
pkg-config zlib1g-dev curl unzip \
software-properties-common \
hipcub && \
\
# ───── Add Python ${PY_VER} PPA & install Python ${PY_VER} + setuptools ─────
add-apt-repository ppa:deadsnakes/ppa && \
Expand Down
32 changes: 32 additions & 0 deletions ep/bench/run_ep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Launch the UCCL EP benchmarks across nodes via torchrun.
# The original first line was '# !/bin/bash' — the space made it a plain
# comment, not a shebang, so the script could run under whatever shell
# invoked it.
#
# Usage: run_ep.sh [NNODES] [RANK] [MODE] [MAIN_IP]
#   MODE: ll | ht | ll-pressure | ht-pressure

NNODES=${1:-2}
RANK=${2:-0}
MODE=${3:-ll}             # ll, ht, ll-pressure, ht-pressure
MAIN_IP=${4:-10.1.18.53}

export OMP_NUM_THREADS=6
echo "Running nodes $NNODES, rank $RANK, mode $MODE, main IP $MAIN_IP"

# torchrun arguments shared by every mode (quoted to survive odd values).
launch=(torchrun --nnodes="$NNODES" --nproc_per_node=8 --node_rank="$RANK"
        --master_addr="$MAIN_IP" --master_port=12355)

case "$MODE" in
  ll)
    # NOTE(review): 'll' already passes --pressure-test-mode=2, identical
    # to 'll-pressure' except for --debug-hash — confirm this is intended.
    "${launch[@]}" test_low_latency.py --num-tokens=128 \
        --hidden=7168 --num-topk=8 --num-experts=288 --pressure-test-mode=2
    ;;
  ht)
    "${launch[@]}" test_internode.py --num-tokens=4096 \
        --hidden=7168 --num-topk=8 --num-experts=288 --test-ll-compatibility
    ;;
  ll-pressure)
    "${launch[@]}" test_low_latency.py --num-tokens=128 \
        --hidden=7168 --num-topk=8 --num-experts=288 --pressure-test-mode=2 --debug-hash
    ;;
  ht-pressure)
    "${launch[@]}" test_internode.py --num-tokens=4096 \
        --hidden=7168 --num-topk=8 --num-experts=288 --test-ll-compatibility --pressure-test-mode=2
    ;;
  *)
    # The original silently did nothing on a typo'd mode; fail loudly instead.
    echo "Unknown mode: $MODE (expected ll|ht|ll-pressure|ht-pressure)" >&2
    exit 1
    ;;
esac
# --log-dir=logs --redirect=3
Loading