Skip to content

Commit

Permalink
Use new UCX images in devcontainers (#308)
Browse files Browse the repository at this point in the history
* Use new UCX images in devcontainers, add CUDA 12.4 devcontainers, test CUDA 11.8, 12.0, 12.2, and 12.4 in CI

* set git-lfs feature's autoPull arg to false

* remove 12.4 devcontainers and revert to building CUDA 12.0 and 12.2 in CI

* always add origin and upstream remotes

* support defining envvars in manifest.yaml for cpp libs

* build multiple archs in parallel in CI

* temporarily use my rapids-cmake fork with a fix for nvtx3

* build up to 6 archs at a time

* bump up raft device obj memory usage

* default max_archs to the inferred n_arch

* default to compiling RAFT, set MAX_DEVICE_OBJ_MEMORY_USAGE=3 for cuvs

* use rapids-cmake branch-24.06 again

* fix divide by zero
  • Loading branch information
trxcllnt authored May 24, 2024
1 parent 7d6ca6e commit c76e7ca
Show file tree
Hide file tree
Showing 20 changed files with 80 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/cuda11.8-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
15 changes: 10 additions & 5 deletions .devcontainer/cuda11.8-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,28 @@
"args": {
"CUDA": "11.8",
"PYTHON_PACKAGE_MANAGER": "pip",
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"./features/src/ucx": {"version": "1.15.0"},
"./features/src/cuda": {"version": "11.8", "installcuBLAS": true, "installcuDNN": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true},
"./features/src/cuda": {
"version": "11.8",
"installcuBLAS": true,
"installcuDNN": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
"./features/src/utils": {},
"./features/src/rapids-build-utils": {}
},
"overrideFeatureInstallOrder": [
"./features/src/ucx",
"./features/src/cuda",
"./features/src/utils",
"./features/src/rapids-build-utils"
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.0-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.0-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.0-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
15 changes: 10 additions & 5 deletions .devcontainer/cuda12.0-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,28 @@
"args": {
"CUDA": "12.0",
"PYTHON_PACKAGE_MANAGER": "pip",
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.0-ubuntu22.04"
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.0-ucx1.15.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.0-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.0-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"./features/src/ucx": {"version": "1.15.0"},
"./features/src/cuda": {"version": "12.0", "installcuBLAS": true, "installcuDNN": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true},
"./features/src/cuda": {
"version": "12.0",
"installcuBLAS": true,
"installcuDNN": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
"./features/src/utils": {},
"./features/src/rapids-build-utils": {}
},
"overrideFeatureInstallOrder": [
"./features/src/ucx",
"./features/src/cuda",
"./features/src/utils",
"./features/src/rapids-build-utils"
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.2-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
15 changes: 10 additions & 5 deletions .devcontainer/cuda12.2-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,28 @@
"args": {
"CUDA": "12.2",
"PYTHON_PACKAGE_MANAGER": "pip",
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
"BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ucx1.15.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"./features/src/ucx": {"version": "1.15.0"},
"./features/src/cuda": {"version": "12.2", "installcuBLAS": true, "installcuDNN": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true},
"./features/src/cuda": {
"version": "12.2",
"installcuBLAS": true,
"installcuDNN": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
"./features/src/utils": {},
"./features/src/rapids-build-utils": {}
},
"overrideFeatureInstallOrder": [
"./features/src/ucx",
"./features/src/cuda",
"./features/src/utils",
"./features/src/rapids-build-utils"
Expand Down
19 changes: 11 additions & 8 deletions .github/workflows/build-all-rapids-repos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,16 @@ jobs:
sccache -z;
sccache --show-adv-stats;
clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules;
build-all \
-v \
-j$(nproc --ignore=1) \
-DBUILD_TESTS=ON \
-DBUILD_BENCHMARKS=ON \
-DBUILD_ANN_BENCH=ON \
-DBUILD_PRIMS_BENCH=ON \
-DBUILD_SHARED_LIBS=ON \
INFER_NUM_DEVICE_ARCHITECTURES=1 \
build-all \
-v \
-j$(nproc --ignore=1) \
-DBUILD_SHARED_LIBS=ON \
-DBUILD_TESTS=ON \
-DBUILD_BENCHMARKS=ON \
-DBUILD_ANN_BENCH=ON \
-DBUILD_PRIMS_BENCH=ON \
-DRAFT_COMPILE_LIBRARY=ON \
-DBUILD_CUGRAPH_MG_TESTS=ON \
;
sccache --show-adv-stats;
2 changes: 1 addition & 1 deletion features/src/rapids-build-utils/devcontainer-feature.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "NVIDIA RAPIDS devcontainer build utilities",
"id": "rapids-build-utils",
"version": "24.6.21",
"version": "24.6.22",
"description": "A feature to install the RAPIDS devcontainer build utilities",
"containerEnv": {
"BASH_ENV": "/etc/bash.bash_env"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ generate_script() {
if test -n "${bin}"; then
(
cat - \
| envsubst '$HOME $NAME $SRC_PATH $PY_ENV $PY_SRC $PY_LIB $BIN_DIR $CPP_LIB $CPP_SRC $CPP_CMAKE_ARGS $CPP_CPACK_ARGS $CPP_DEPS $GIT_TAG $GIT_SSH_URL $GIT_HTTPS_URL $GIT_REPO $GIT_HOST $GIT_UPSTREAM $PIP_WHEEL_ARGS $PIP_INSTALL_ARGS' \
| envsubst '$HOME $NAME $SRC_PATH $PY_ENV $PY_SRC $PY_LIB $BIN_DIR $CPP_ENV $CPP_LIB $CPP_SRC $CPP_CMAKE_ARGS $CPP_CPACK_ARGS $CPP_DEPS $GIT_TAG $GIT_SSH_URL $GIT_HTTPS_URL $GIT_REPO $GIT_HOST $GIT_UPSTREAM $PIP_WHEEL_ARGS $PIP_INSTALL_ARGS' \
| tee "${TMP_SCRIPT_DIR}/${bin}" >/dev/null;

chmod +x "${TMP_SCRIPT_DIR}/${bin}";
Expand Down Expand Up @@ -243,6 +243,7 @@ generate_scripts() {

for ((j=0; j < ${!cpp_length:-0}; j+=1)); do

cpp_env="${repo}_cpp_${j}_env";
cpp_name="${repo}_cpp_${j}_name";
cpp_sub_dir="${repo}_cpp_${j}_sub_dir";
cpp_cmake_args="${repo}_cpp_${j}_args_cmake";
Expand Down Expand Up @@ -280,6 +281,7 @@ generate_scripts() {
NAME="${repo_name:-}" \
SRC_PATH=~/"${!repo_path:-}" \
BIN_DIR="${bin_dir}" \
CPP_ENV="${!cpp_env:-}" \
CPP_LIB="${cpp_name:-}" \
CPP_SRC="${!cpp_sub_dir:-}" \
CPP_DEPS="${cpp_deps[*]}" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,14 @@ get_num_archs_jobs_and_load() {
fi

parallel="${j:-${JOBS:-${PARALLEL_LEVEL:-1}}}";
max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${archs:-3}}}";
max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${arch:-}}}";
max_device_obj_memory_usage="${max_device_obj_memory_usage:-${MAX_DEVICE_OBJ_MEMORY_USAGE:-1}}";

local n_arch="${archs:-0}";
local n_arch="${archs:-1}";

# Number of CUDA architectures in the rapids-cmake list used for `ALL`/`RAPIDS`.
# Currently six: 60-real;70-real;75-real;80-real;86-real;90
# see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
local n_arch_rapids=6;

if test -z "${archs:-}" \
&& test -n "${INFER_NUM_DEVICE_ARCHITECTURES:-}"; then
Expand All @@ -69,12 +73,10 @@ get_num_archs_jobs_and_load() {
;;
all | all-major)
# Max out at ${max_archs} threads per job
n_arch=${max_archs};
n_arch="${max_archs:-${n_arch_rapids}}";
;;
ALL | RAPIDS)
# currently: 60-real;70-real;75-real;80-real;86-real;90
# see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
n_arch=6;
n_arch=${n_arch_rapids};
;;
*)
# Otherwise if explicitly defined, count the number of archs in the list
Expand All @@ -83,16 +85,16 @@ get_num_archs_jobs_and_load() {
esac
fi

local mem_for_device_objs=1;

if test "${n_arch}" -le 0; then
n_arch=1;
else
max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${n_arch}}}";
# Clamp to `min(n_arch, max_archs)` threads per job
n_arch=$((n_arch > max_archs ? max_archs : n_arch));
mem_for_device_objs="$((n_arch * max_device_obj_memory_usage))";
fi

local mem_for_device_objs="$((n_arch * max_device_obj_memory_usage))";

local -r free_mem="$(free --gibi | grep -E '^Mem:' | tr -s '[:space:]' | cut -d' ' -f7 || echo '0')";
local -r freeswap="$(free --gibi | grep -E '^Swap:' | tr -s '[:space:]' | cut -d' ' -f4 || echo '0')";
local -r mem_total="${max_total_system_memory:-${MAX_TOTAL_SYSTEM_MEMORY:-$((free_mem + freeswap))}}";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ build_${CPP_LIB}_cpp() {
local -;
set -euo pipefail;

export ${CPP_ENV} PATH="$PATH";

eval "$( \
PARALLEL_LEVEL=${PARALLEL_LEVEL:-$(nproc --all)} \
rapids-get-num-archs-jobs-and-load "$@" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ configure_${CPP_LIB}_cpp() {
local -;
set -euo pipefail;

export ${CPP_ENV} PATH="$PATH";

eval "$( \
PARALLEL_LEVEL=${PARALLEL_LEVEL:-$(nproc --all)} \
rapids-get-num-archs-jobs-and-load "$@" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ build_${PY_LIB}_python_wheel() {
local -;
set -euo pipefail;

export ${PY_ENV} PATH="$PATH";

eval "$( \
PARALLEL_LEVEL=${PARALLEL_LEVEL:-$(nproc --all)} \
rapids-get-num-archs-jobs-and-load "$@" \
Expand Down Expand Up @@ -85,8 +87,6 @@ build_${PY_LIB}_python_wheel() {
trap "rm -rf '${PY_SRC}/${py_lib//"-"/"_"}.egg-info'" EXIT;

time (
export ${PY_ENV} PATH="$PATH";

local cudaflags="${CUDAFLAGS:+$CUDAFLAGS }-t=${n_arch}";
local build_type="$(rapids-select-cmake-build-type "${cmake_args_[@]}")";
local nvcc_append_flags="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS }-t=${n_arch}";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ install_${PY_LIB}_python() {
local -;
set -euo pipefail;

export ${PY_ENV} PATH="$PATH";

eval "$( \
PARALLEL_LEVEL=${PARALLEL_LEVEL:-$(nproc --all)} \
rapids-get-num-archs-jobs-and-load "$@" \
Expand Down Expand Up @@ -103,8 +105,6 @@ install_${PY_LIB}_python() {
trap "rm -rf '${PY_SRC}/${py_lib//"-"/"_"}.egg-info'" EXIT;

time (
export ${PY_ENV} PATH="$PATH";

local cudaflags="${CUDAFLAGS:+$CUDAFLAGS }-t=${n_arch}";
local build_type="$(rapids-select-cmake-build-type "${cmake_args_[@]}")";
local nvcc_append_flags="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS }-t=${n_arch}";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ repos:
- name: raft
sub_dir: cpp
depends: [rmm]
env: |
MAX_DEVICE_OBJ_MEMORY_USAGE=3
args: {cmake: -DRAFT_COMPILE_LIBRARY=ON}
python:
- name: pylibraft
sub_dir: python/pylibraft
Expand All @@ -99,6 +102,8 @@ repos:
- name: cuvs
sub_dir: cpp
depends: [raft]
env: |
MAX_DEVICE_OBJ_MEMORY_USAGE=3
args: {cmake: -DBUILD_C_LIBRARY=ON}
python:
- name: cuvs
Expand Down Expand Up @@ -179,6 +184,8 @@ repos:
- name: cugraph
sub_dir: cpp
depends: [cugraph-ops]
env: |
MAX_DEVICE_OBJ_MEMORY_USAGE=5
- name: cugraph_etl
sub_dir: cpp/libcugraph_etl
depends: [cudf, cugraph]
Expand Down
2 changes: 1 addition & 1 deletion features/src/utils/devcontainer-feature.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "devcontainer-utils",
"id": "utils",
"version": "24.6.1",
"version": "24.6.2",
"description": "A feature to install RAPIDS devcontainer utility scripts",
"containerEnv": {
"BASH_ENV": "/etc/bash.bash_env"
Expand Down
10 changes: 2 additions & 8 deletions features/src/utils/opt/devcontainer/bin/git/repo/clone.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,8 @@ clone_git_repo() {
git clone "${qj[@]}" "${OPTS[@]}" -- "${origin}" "${directory}";
fi

if ! git -C "${directory}" remote -v show | grep -q origin; then
git -C "${directory}" remote add origin "${origin}" || true;
fi

if ! git -C "${directory}" remote -v show | grep -q upstream; then
git -C "${directory}" remote add upstream "${upstream}" || true;
fi

git -C "${directory}" remote add origin "${origin}" || true;
git -C "${directory}" remote add upstream "${upstream}" || true;
git -C "${directory}" remote set-url upstream "${upstream}" || true;
git -C "${directory}" remote set-url --push upstream read_only || true;
if test "${upstream}" == "${origin}"; then
Expand Down
2 changes: 0 additions & 2 deletions features/src/utils/opt/devcontainer/bin/github/repo/clone.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
# -q,--quiet Operate quietly. Progress is not reported to the standard error stream.
# --no-fork Don't prompt the user to fork the repo if a user fork isn't found.
# (default: false)
# --no-update-env Don't update the Python env with the repo's dependencies after cloning.
# (default: false)
# --clone-upstream Always clone the upstream, not the user's fork.
# (default: false)
#
Expand Down
2 changes: 0 additions & 2 deletions features/src/utils/opt/devcontainer/bin/gitlab/repo/clone.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
# -q,--quiet Operate quietly. Progress is not reported to the standard error stream.
# --no-fork Don't prompt the user to fork the repo if a user fork isn't found.
# (default: false)
# --no-update-env Don't update the Python env with the repo's dependencies after cloning.
# (default: false)
# --clone-upstream Always clone the upstream, not the user's fork.
# (default: false)
#
Expand Down
4 changes: 3 additions & 1 deletion image/.devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
"ppa": "true",
"version": "latest"
},
"ghcr.io/devcontainers/features/git-lfs:1": {},
"ghcr.io/devcontainers/features/git-lfs:1": {
"autoPull": false
},
"ghcr.io/devcontainers/features/github-cli:1": {},
"./features/src/gitlab-cli": {},
"./features/src/cmake": {},
Expand Down

0 comments on commit c76e7ca

Please sign in to comment.