
Commit ba2bc42

nccl example for d2d, d2h, h2d, h2h
1 parent 3b71fc0 commit ba2bc42

1 file changed: +171, -0 lines

sample_nccl.cc

@@ -0,0 +1,171 @@
// This is an expansion of the code shown at https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-1-one-device-per-process-or-thread
// If the send/recv buffers are regular (pageable) host variables, an illegal memory access is produced.
// Build: mpicc sample_nccl.cc -I$CUDA_HOME/include -L$CUDA_HOME/lib64 -lcudart -I$NCCL_HOME/include -L$NCCL_HOME/lib -lnccl
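// Run (illustrative, not part of the original commit; the exact launcher and
// flags depend on the local MPI installation), e.g. one rank per GPU:
//   mpirun -np 2 ./a.out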
#include <stdio.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include "mpi.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>

#define MPICHECK(cmd) do {                          \
  int e = cmd;                                      \
  if( e != MPI_SUCCESS ) {                          \
    printf("Failed: MPI error %s:%d '%d'\n",        \
        __FILE__,__LINE__, e);                      \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r != ncclSuccess) {                           \
    printf("Failed, NCCL error %s:%d '%s'\n",       \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; string[c] != '\0'; c++){
    result = ((result << 5) + result) ^ string[c];
  }
  return result;
}

static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  for (int i=0; i< maxlen; i++) {
    if (hostname[i] == '.') {
      hostname[i] = '\0';
      return;
    }
  }
}

int main(int argc, char* argv[])
{
  int size = 32*1024*1024;

  int myRank, nRanks, localRank = 0;

  //initializing MPI
  MPICHECK(MPI_Init(&argc, &argv));
  MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));

  //calculating localRank based on hostname which is used in selecting a GPU
  uint64_t hostHashs[nRanks];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[myRank] = getHostHash(hostname);
  MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
  for (int p=0; p<nRanks; p++) {
    if (p == myRank) break;
    if (hostHashs[p] == hostHashs[myRank]) localRank++;
  }
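  // Illustration of the counting above (not in the original commit): with
  // four ranks on two hosts, hostnames [A,A,B,B] yield localRank [0,1,0,1],
  // so each rank later selects a distinct GPU on its own host.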

  ncclUniqueId id;
  ncclComm_t comm;
  float *sendbuff, *recvbuff;
  float *send_h, *recv_h;

  //get NCCL unique ID at rank 0 and broadcast it to all others
  if (myRank == 0) NCCLCHECK(ncclGetUniqueId(&id));
  MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));

  //picking a GPU based on localRank, allocate device and managed buffers
  CUDACHECK(cudaSetDevice(localRank));
  CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float)));
  CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float)));
  CUDACHECK(cudaMallocManaged(&send_h, size * sizeof(float)));
  CUDACHECK(cudaMallocManaged(&recv_h, size * sizeof(float)));

  //initializing NCCL
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
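
  // Note on buffers (added context, not in the original commit): NCCL kernels
  // execute on the GPU, so the send/recv pointers must be GPU-addressable.
  // cudaMalloc and cudaMallocManaged allocations both qualify; plain host
  // variables do not, which is what the header warning refers to.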
  {
    printf("d2d\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: device sendbuff -> device recvbuff
    NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  {
    printf("h2h\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: managed send_h -> managed recv_h
    NCCLCHECK(ncclAllReduce((const void*)send_h, (void*)recv_h, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  {
    printf("d2h\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: device sendbuff -> managed recv_h
    NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recv_h, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  {
    printf("h2d\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: managed send_h -> device recvbuff
    NCCLCHECK(ncclAllReduce((const void*)send_h, (void*)recvbuff, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  //free device and managed buffers
  CUDACHECK(cudaFree(sendbuff));
  CUDACHECK(cudaFree(recvbuff));
  CUDACHECK(cudaFree(send_h));
  CUDACHECK(cudaFree(recv_h));

  //finalizing NCCL
  ncclCommDestroy(comm);

  //finalizing MPI
  MPICHECK(MPI_Finalize());

  printf("[MPI Rank %d] Success \n", myRank);
  return 0;
}
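
For contrast with the four passing cases above, a minimal sketch of the failing case the header comment warns about (illustrative only, not part of this commit; send_bad and recv_bad are hypothetical names). Pageable host memory from plain malloc is not addressable by the GPU, so NCCL would be expected to abort with an illegal memory access:

  float *send_bad = (float*)malloc(size * sizeof(float));
  float *recv_bad = (float*)malloc(size * sizeof(float));
  // expected to fail with an illegal-memory-access error:
  // NCCLCHECK(ncclAllReduce((const void*)send_bad, (void*)recv_bad, size,
  //     ncclFloat, ncclSum, comm, s));
  free(send_bad);
  free(recv_bad);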
