-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsumMatrixGPUManaged.cu
More file actions
186 lines (159 loc) · 7 KB
/
sumMatrixGPUManaged.cu
File metadata and controls
186 lines (159 loc) · 7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
/*
* This example demonstrates the use of CUDA managed memory to implement matrix
* addition. In this example, arbitrary pointers can be dereferenced on the host
* and device. CUDA will automatically manage the transfer of data to and from
* the GPU as needed by the application. There is no need for the programmer to
* use cudaMemcpy, cudaHostGetDevicePointer, or any other CUDA API involved with
* explicitly transferring data. In addition, because CUDA managed memory is not
* forced to reside in a single place it can be transferred to the optimal
* memory space and not require round-trips over the PCIe bus every time a
* cross-device reference is performed (as is required with zero copy and UVA).
*/
#define CHECK(call) \
{ \
const cudaError_t error = call;\
if (error != cudaSuccess) { \
printf("[device] Error: %s %d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(-10*error); \
} \
}
void initialData(float* ip, const int size) {
for (int i = 0; i < size; i++) {
ip[i] = (float)(rand() & 0xFF) / 10.0f;
}
}
void sumMatrixOnHost(float* A, float* B, float* C, const int nx, const int ny) {
float* ia = A;
float* ib = B;
float* ic = C;
for (int iy = 0; iy < ny; iy++) {
for (int ix = 0; ix < nx; ix++) {
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
}
void checkResult(float* hostRef, float* gpuRef, const int N) {
double epsilon = 1.0E-8;
bool match = 1;
for (int i = 0; i < N; i++) {
if (abs(hostRef[i] - gpuRef[i]) > epsilon) {
match = 0;
printf("[host] host %f gpu %f\n", hostRef[i], gpuRef[i]);
break;
}
}
if (!match) {
printf("[host] Arrays do not match.\n\n");
}
}
__global__ void warmup(float* MatA, float* MatB, float* MatC, int nx, int ny) {
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny) {
MatC[idx] = MatA[idx] + MatB[idx];
}
}
// grid 2D block 2D
__global__ void sumMatrixGPU(float* MatA, float* MatB, float* MatC, int nx, int ny) {
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny) {
MatC[idx] = MatA[idx] + MatB[idx];
}
}
int main(int argc, char** argv) {
printf("[host] %s Starting ", argv[0]);
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("using Device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
// set up data size of matrix
int nx, ny;
int ishift = 12;
if (argc > 1) ishift = atoi(argv[1]);
nx = ny = 1 << ishift;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
printf("[host] Matrix size: nx %d ny %d\n", nx, ny);
// malloc host memory
float* A, * B, * hostRef, * gpuRef;
CHECK(cudaMallocManaged((void**)&A, nBytes));
CHECK(cudaMallocManaged((void**)&B, nBytes));
CHECK(cudaMallocManaged((void**)&gpuRef, nBytes); );
CHECK(cudaMallocManaged((void**)&hostRef, nBytes););
// initialize data at host side
initialData(A, nxy);
initialData(B, nxy);
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
// add matrix at host side for result checks
sumMatrixOnHost(A, B, hostRef, nx, ny);
// invoke kernel at host side
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
// warm-up kernel, with unified memory all pages will migrate from host to
// device
warmup << <grid, block >> > (A, B, gpuRef, 1, 1);
sumMatrixGPU << <grid, block >> > (A, B, gpuRef, nx, ny);
CHECK(cudaDeviceSynchronize());
// check kernel error
CHECK(cudaGetLastError());
// check device results
checkResult(hostRef, gpuRef, nxy);
// free device global memory
CHECK(cudaFree(A));
CHECK(cudaFree(B));
CHECK(cudaFree(hostRef));
CHECK(cudaFree(gpuRef));
// reset device
CHECK(cudaDeviceReset());
}
/*
output:
c:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe
[host] ./Cuda.exe Starting ==3268== NVPROF is profiling process 3268, command: ./Cuda.exe
using Device 0: NVIDIA GeForce MX450
[host] Matrix size: nx 4096 ny 4096
==3268== Profiling application: ./Cuda.exe
==3268== Warning: 34 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==3268== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 79.03% 4.2530ms 1 4.2530ms 4.2530ms 4.2530ms sumMatrixGPU(float*, float*, float*, int, int)
20.97% 1.1286ms 1 1.1286ms 1.1286ms 1.1286ms warmup(float*, float*, float*, int, int)
API calls: 60.85% 363.26ms 2 181.63ms 31.100us 363.23ms cudaLaunchKernel
15.70% 93.709ms 4 23.427ms 6.1693ms 63.253ms cudaMallocManaged
10.84% 64.684ms 1 64.684ms 64.684ms 64.684ms cudaSetDevice
7.90% 47.182ms 4 11.795ms 4.8670ms 18.414ms cudaFree
3.80% 22.656ms 1 22.656ms 22.656ms 22.656ms cudaDeviceReset
0.91% 5.4125ms 1 5.4125ms 5.4125ms 5.4125ms cudaDeviceSynchronize
0.00% 29.500us 1 29.500us 29.500us 29.500us cuLibraryUnload
0.00% 20.300us 114 178ns 0ns 3.4000us cuDeviceGetAttribute
0.00% 2.8000us 1 2.8000us 2.8000us 2.8000us cudaGetDeviceProperties
0.00% 2.2000us 3 733ns 100ns 1.9000us cuDeviceGetCount
0.00% 1.9000us 1 1.9000us 1.9000us 1.9000us cuDeviceTotalMem
0.00% 1.8000us 1 1.8000us 1.8000us 1.8000us cuModuleGetLoadingMode
0.00% 800ns 2 400ns 100ns 700ns cuDeviceGet
0.00% 800ns 1 800ns 800ns 800ns cuDeviceGetName
0.00% 700ns 1 700ns 700ns 700ns cudaGetLastError
0.00% 400ns 1 400ns 400ns 400ns cuDeviceGetLuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
==3268== Unified Memory profiling result:
Device "NVIDIA GeForce MX450 (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
2048 128.00KB 128.00KB 128.00KB 256.0000MB 296.3975ms Host To Device
2322 169.34KB 64.000KB 1.0000MB 384.0000MB 2.880703s Device To Host
*/