From 60473d16230986aa91f786107c2f78533010cf27 Mon Sep 17 00:00:00 2001
From: Oguz Pastirmaci
Date: Tue, 25 Jun 2024 11:55:52 -0700
Subject: [PATCH] Update BM.GPU.H100.8-nccl-test.yaml

---
 manifests/BM.GPU.H100.8-nccl-test.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/manifests/BM.GPU.H100.8-nccl-test.yaml b/manifests/BM.GPU.H100.8-nccl-test.yaml
index b433f57..e303dad 100644
--- a/manifests/BM.GPU.H100.8-nccl-test.yaml
+++ b/manifests/BM.GPU.H100.8-nccl-test.yaml
@@ -51,7 +51,7 @@ spec:
               -x IB_RX_QUEUE_LEN=8192 \
               -x NCCL_SOCKET_IFNAME=eth0 \
               -x NCCL_IGNORE_CPU_AFFINITY=1 \
-              /workspace/nccl-tests/build/alltoall_perf -b 8 -f 2 -g 1 -e 4G -c 1
+              /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 4G -c 1
             while :; do { [[ $exit ]] && break; }; sleep 1; done
           ports:
           - { name: mpijob-port, containerPort: 2222, protocol: TCP }
@@ -59,11 +59,11 @@ spec:
           name: mpimaster
           resources:
             limits:
-              ephemeral-storage: 32Gi
+              ephemeral-storage: 16Gi
             requests:
-              cpu: 128
-              ephemeral-storage: 32Gi
-              memory: 512Gi
+              cpu: 4
+              ephemeral-storage: 16Gi
+              memory: 1Gi
           securityContext:
             privileged: true
             capabilities:
@@ -121,4 +121,4 @@ spec:
         - { key: nvidia.com/gpu, operator: Exists }
         volumes:
         - { name: devinf, hostPath: { path: /dev/infiniband }}
-        - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }}
\ No newline at end of file
+        - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }}