From 91ab62fa4dac6216797ac0b3590a59762408a087 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Tue, 3 Dec 2024 20:04:16 +0000 Subject: [PATCH 1/2] update nccl-plugin and rxdm versions --- tools/prologs-epilogs/receive-data-path-manager-mega | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/prologs-epilogs/receive-data-path-manager-mega b/tools/prologs-epilogs/receive-data-path-manager-mega index 3cc5a00e..0906d3a4 100755 --- a/tools/prologs-epilogs/receive-data-path-manager-mega +++ b/tools/prologs-epilogs/receive-data-path-manager-mega @@ -30,8 +30,8 @@ fi # ensure that dmabuf-import-helper is loaded modprobe import-helper -NCCL_PLUGIN_IMAGE=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.4 -RXDM_IMAGE=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.10 +NCCL_PLUGIN_IMAGE=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.7 +RXDM_IMAGE=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.13 RXDM_CONTAINER=receive-datapath-manager-"${SLURM_JOB_ID}" if [[ ${SLURM_SCRIPT_CONTEXT} == "prolog_slurmd" ]]; then docker container list --filter "name=receive-datapath-manager-*" --quiet | xargs --no-run-if-empty docker container stop @@ -44,7 +44,7 @@ if [[ ${SLURM_SCRIPT_CONTEXT} == "prolog_slurmd" ]]; then --network=host \ -v /var/lib:/var/lib \ ${NCCL_PLUGIN_IMAGE} \ - install + install --install-nccl # Modify NCCL env vars for Debian 12. # /var/lib/tcpxo/lib64/nccl-env-profile.sh is written by the nccl-installer container From 8aa68bf04cd80ea38a532f6213e0af3f75907696 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Wed, 4 Dec 2024 18:56:20 +0000 Subject: [PATCH 2/2] remove and reinstall current libnccl each run --- tools/prologs-epilogs/receive-data-path-manager-mega | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/prologs-epilogs/receive-data-path-manager-mega b/tools/prologs-epilogs/receive-data-path-manager-mega index 0906d3a4..b83d8e5e 100755 --- a/tools/prologs-epilogs/receive-data-path-manager-mega +++ b/tools/prologs-epilogs/receive-data-path-manager-mega @@ -39,6 +39,8 @@ if [[ ${SLURM_SCRIPT_CONTEXT} == "prolog_slurmd" ]]; then export PATH=${PATH}:/usr/local/lib/google-cloud-sdk/bin/ gcloud auth configure-docker --quiet us-docker.pkg.dev 2>&1 &>/dev/null + rm -rf /var/lib/tcpxo/lib64 || true + # Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/. docker run --rm --name nccl-installer \ --network=host \