
Commit ba2bc42

nccl example for d2d, d2h, h2d, h2h
1 parent 3b71fc0 commit ba2bc42

1 file changed: +171, -0 lines

sample_nccl.cc

@@ -0,0 +1,171 @@
// This is an expansion of the code shown at https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-1-one-device-per-process-or-thread
// If the send/recv buffers are regular (pageable) host variables, an illegal memory access is produced.
// Build: mpicc sample_nccl.cc -I$CUDA_HOME/include -L$CUDA_HOME/lib64 -lcudart -I$NCCL_HOME/include -L$NCCL_HOME/lib -lnccl
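// Run (illustrative, not part of the original commit; the exact launcher and
// flags depend on the local MPI installation), e.g. one rank per GPU:
//   mpirun -np 2 ./a.out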
#include <stdio.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include "mpi.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>

#define MPICHECK(cmd) do {                          \
  int e = cmd;                                      \
  if( e != MPI_SUCCESS ) {                          \
    printf("Failed: MPI error %s:%d '%d'\n",        \
        __FILE__,__LINE__, e);                      \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r != ncclSuccess) {                           \
    printf("Failed, NCCL error %s:%d '%s'\n",       \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; string[c] != '\0'; c++){
    result = ((result << 5) + result) ^ string[c];
  }
  return result;
}

static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  for (int i=0; i< maxlen; i++) {
    if (hostname[i] == '.') {
      hostname[i] = '\0';
      return;
    }
  }
}

int main(int argc, char* argv[])
{
  int size = 32*1024*1024;

  int myRank, nRanks, localRank = 0;

  //initializing MPI
  MPICHECK(MPI_Init(&argc, &argv));
  MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));

  //calculating localRank based on hostname which is used in selecting a GPU
  uint64_t hostHashs[nRanks];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[myRank] = getHostHash(hostname);
  MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
  for (int p=0; p<nRanks; p++) {
    if (p == myRank) break;
    if (hostHashs[p] == hostHashs[myRank]) localRank++;
  }
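  // Illustration of the counting above (not in the original commit): with
  // four ranks on two hosts, hostnames [A,A,B,B] yield localRank [0,1,0,1],
  // so each rank later selects a distinct GPU on its own host.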

  ncclUniqueId id;
  ncclComm_t comm;
  float *sendbuff, *recvbuff;
  float *send_h, *recv_h;

  //get NCCL unique ID at rank 0 and broadcast it to all others
  if (myRank == 0) NCCLCHECK(ncclGetUniqueId(&id));
  MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));

  //picking a GPU based on localRank, allocate device and managed buffers
  CUDACHECK(cudaSetDevice(localRank));
  CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float)));
  CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float)));
  CUDACHECK(cudaMallocManaged(&send_h, size * sizeof(float)));
  CUDACHECK(cudaMallocManaged(&recv_h, size * sizeof(float)));

  //initializing NCCL
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
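
  // Note on buffers (added context, not in the original commit): NCCL kernels
  // execute on the GPU, so the send/recv pointers must be GPU-addressable.
  // cudaMalloc and cudaMallocManaged allocations both qualify; plain host
  // variables do not, which is what the header warning refers to.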
  {
    printf("d2d\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: device sendbuff -> device recvbuff
    NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  {
    printf("h2h\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: managed send_h -> managed recv_h
    NCCLCHECK(ncclAllReduce((const void*)send_h, (void*)recv_h, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  {
    printf("d2h\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: device sendbuff -> managed recv_h
    NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recv_h, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  {
    printf("h2d\n");
    cudaStream_t s;
    CUDACHECK(cudaStreamCreate(&s));
    //communicating using NCCL: managed send_h -> device recvbuff
    NCCLCHECK(ncclAllReduce((const void*)send_h, (void*)recvbuff, size, ncclFloat, ncclSum,
        comm, s));
    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaStreamDestroy(s));
  }

  //free device and managed buffers
  CUDACHECK(cudaFree(sendbuff));
  CUDACHECK(cudaFree(recvbuff));
  CUDACHECK(cudaFree(send_h));
  CUDACHECK(cudaFree(recv_h));

  //finalizing NCCL
  ncclCommDestroy(comm);

  //finalizing MPI
  MPICHECK(MPI_Finalize());

  printf("[MPI Rank %d] Success \n", myRank);
  return 0;
}
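
For contrast with the four passing cases above, a minimal sketch of the failing case the header comment warns about (illustrative only, not part of this commit; send_bad and recv_bad are hypothetical names). Pageable host memory from plain malloc is not addressable by the GPU, so NCCL would be expected to abort with an illegal memory access:

  float *send_bad = (float*)malloc(size * sizeof(float));
  float *recv_bad = (float*)malloc(size * sizeof(float));
  // expected to fail with an illegal-memory-access error:
  // NCCLCHECK(ncclAllReduce((const void*)send_bad, (void*)recv_bad, size,
  //     ncclFloat, ncclSum, comm, s));
  free(send_bad);
  free(recv_bad);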
