diff --git a/tools/prologs-epilogs/receive-data-path-manager b/tools/prologs-epilogs/receive-data-path-manager index e4b3d062..dcaa60af 100755 --- a/tools/prologs-epilogs/receive-data-path-manager +++ b/tools/prologs-epilogs/receive-data-path-manager @@ -28,11 +28,15 @@ fi # path at which RxDM sockets will be created UDS_PATH="/run/tcpx-${SLURM_JOB_ID}" +NCCL_PLUGIN_IMAGE=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9-2.19.4-12.0 RXDM_IMAGE=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.12 if [[ ${SLURM_SCRIPT_CONTEXT} == "prolog_slurmd" ]]; then # Install the TCPX NCCL Plugin - docker run --rm -v /var/lib:/var/lib \ - us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9-2.19.4-12.0 install + docker run --rm --name nccl-installer \ + --network=host \ + -v /var/lib:/var/lib \ + ${NCCL_PLUGIN_IMAGE} \ + install # Start TCPX receive-datapath-manager GPU_NIC_TOPOLOGY=/opt/tcpdirect_benchmark/gpu_rxq_configuration.textproto diff --git a/tools/prologs-epilogs/receive-data-path-manager-mega b/tools/prologs-epilogs/receive-data-path-manager-mega index 0f0215b9..3cc5a00e 100755 --- a/tools/prologs-epilogs/receive-data-path-manager-mega +++ b/tools/prologs-epilogs/receive-data-path-manager-mega @@ -36,13 +36,13 @@ RXDM_CONTAINER=receive-datapath-manager-"${SLURM_JOB_ID}" if [[ ${SLURM_SCRIPT_CONTEXT} == "prolog_slurmd" ]]; then docker container list --filter "name=receive-datapath-manager-*" --quiet | xargs --no-run-if-empty docker container stop - export PATH=${PATH}:/usr/local/lib/google-cloud-sdk/bin/ - gcloud auth configure-docker --quiet us-docker.pkg.dev 2>&1 &>/dev/null + export PATH=${PATH}:/usr/local/lib/google-cloud-sdk/bin/ + gcloud auth configure-docker --quiet us-docker.pkg.dev 2>&1 &>/dev/null # Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/. docker run --rm --name nccl-installer \ - --pull=always \ - --volume /var/lib:/var/lib \ + --network=host \ + -v /var/lib:/var/lib \ ${NCCL_PLUGIN_IMAGE} \ install @@ -50,12 +50,12 @@ if [[ ${SLURM_SCRIPT_CONTEXT} == "prolog_slurmd" ]]; then # /var/lib/tcpxo/lib64/nccl-env-profile.sh is written by the nccl-installer container # above, and assumes interface names of eth{0..8}. if (grep -q "ID=debian" /etc/os-release && lsb_release -rs | grep -q "12"); then - cat >> /var/lib/tcpxo/lib64/nccl-env-profile.sh <<- EOF -export NCCL_FASTRAK_CTRL_DEV=enp0s12 -export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 -export NCCL_SOCKET_IFNAME=enp0s12 -export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices -EOF + cat >>/var/lib/tcpxo/lib64/nccl-env-profile.sh <<-EOF + export NCCL_FASTRAK_CTRL_DEV=enp0s12 + export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices + EOF fi # Start FasTrak receive-datapath-manager