-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path2-1-2.cu
More file actions
204 lines (186 loc) · 10.5 KB
/
2-1-2.cu
File metadata and controls
204 lines (186 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define CHECK(call) \
{ \
const cudaError_t error = call;\
if (error != cudaSuccess) { \
printf("[device] Error: %s %d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(-10*error); \
} \
}
// Check if the computation results of the CPU and GPU are the same
void checkResultInt(int* h_data, int *d_data, const int n) {
int match = 1;
for (int i = 0; i < n; i++) {
if (h_data[i] != d_data[i]) {
match = 0;
printf("[host] Arrays do not match!\n");
printf("[host] host %5d gpu %5d at current %d\n", h_data[i], d_data[i], i);
break;
}
}
if (match) printf("[host] Arrays match.\n\n");
}
__global__ void sumArrayElementK(int* g_idata, int* g_odata, unsigned int n, int s, int e, int k) {
unsigned int tid = threadIdx.x;
int* idata = g_idata + blockIdx.x * blockDim.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// boundary check
if (idx >= n) return;
// branch divergence!!!
if (s <= idx && idx<= e) g_idata[idx] *= k;
// in-place reduction in global memory
for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
if (tid < stride) {
idata[tid] += idata[tid + stride];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
int main(int argc, char** argv) {
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("[host] %s starting transpose at ", argv[0]);
printf("device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
char in_filename[100];
sprintf(in_filename, "%s.in", argv[1]);
freopen(in_filename, "r", stdin);
int n, s, e, k;
scanf("%d", &n);
int nbytes = n * sizeof(int);
int* h_A = (int*)malloc(nbytes);
for (int i = 0; i < n; i++) {
scanf("%d", h_A + i);
}
scanf("%d%d%d", &s, &e, &k);
int h_answer = 0;
for (int i = 0; i < n; i++) {
if (s <= i && i <= e) h_answer += h_A[i] * k;
else h_answer += h_A[i];
}
printf("[host] host answer : %d\n", h_answer);
int* d_idata, * d_odata;
int blocksize = 512;
int size = n;
dim3 block(blocksize, 1);
dim3 grid((nbytes + blocksize - 1) / blocksize, 1);
CHECK(cudaMalloc((void**)&d_idata, nbytes));
CHECK(cudaMalloc((void**)&d_odata, grid.x * sizeof(int)));
cudaMemcpy(d_idata, h_A, nbytes, cudaMemcpyHostToDevice);
printf("[host] datasize (%d), gird(%d), block(%d)\n", size, grid.x, block.x);
sumArrayElementK << <grid, block >> > (d_idata, d_odata, n, s, e, k);
int* h_odata = (int*)malloc(grid.x * sizeof(int));
cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
int d_answer = 0;
for (int i = 0; i < grid.x; i++) d_answer += h_odata[i];
printf("[host] device answer : %d\n", d_answer);
checkResultInt(&h_answer, &d_answer, 1);
// memory free
free(h_A);
free(h_odata);
cudaFree(d_idata);
cudaFree(d_odata);
}
/*
output:
c:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe 1
==3924== NVPROF is profiling process 3924, command: ./Cuda.exe 1
[host] ./Cuda.exe starting transpose at device 0: NVIDIA GeForce MX450
[host] host answer : 53697915
[host] datasize (10000), gird(79), block(512)
[host] device answer : 53697915
[host] Arrays match.
==3924== Profiling application: ./Cuda.exe 1
==3924== Warning: 34 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==3924== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 55.56% 10.240us 1 10.240us 10.240us 10.240us sumArrayElementK(int*, int*, unsigned int, int, int, int)
32.81% 6.0480us 1 6.0480us 6.0480us 6.0480us [CUDA memcpy HtoD]
11.63% 2.1440us 1 2.1440us 2.1440us 2.1440us [CUDA memcpy DtoH]
API calls: 46.64% 95.833ms 1 95.833ms 95.833ms 95.833ms cudaLaunchKernel
36.44% 74.869ms 1 74.869ms 74.869ms 74.869ms cudaSetDevice
16.37% 33.647ms 1 33.647ms 33.647ms 33.647ms cuDevicePrimaryCtxRelease
0.26% 535.00us 2 267.50us 20.500us 514.50us cudaFree
0.15% 305.60us 2 152.80us 4.6000us 301.00us cudaMalloc
0.09% 177.70us 2 88.850us 64.600us 113.10us cudaMemcpy
0.03% 54.100us 114 474ns 0ns 29.500us cuDeviceGetAttribute
0.02% 43.600us 1 43.600us 43.600us 43.600us cuLibraryUnload
0.00% 5.0000us 1 5.0000us 5.0000us 5.0000us cudaGetDeviceProperties
0.00% 2.5000us 1 2.5000us 2.5000us 2.5000us cuDeviceTotalMem
0.00% 2.0000us 3 666ns 100ns 1.6000us cuDeviceGetCount
0.00% 2.0000us 1 2.0000us 2.0000us 2.0000us cuModuleGetLoadingMode
0.00% 800ns 2 400ns 0ns 800ns cuDeviceGet
0.00% 800ns 1 800ns 800ns 800ns cuDeviceGetName
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetLuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
c:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe 2
==13332== NVPROF is profiling process 13332, command: ./Cuda.exe 2
[host] ./Cuda.exe starting transpose at device 0: NVIDIA GeForce MX450
[host] host answer : 203276154
[host] datasize (10000), gird(79), block(512)
[host] device answer : 203276154
[host] Arrays match.
==13332== Profiling application: ./Cuda.exe 2
==13332== Warning: 31 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==13332== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 55.10% 10.368us 1 10.368us 10.368us 10.368us sumArrayElementK(int*, int*, unsigned int, int, int, int)
32.49% 6.1130us 1 6.1130us 6.1130us 6.1130us [CUDA memcpy HtoD]
12.41% 2.3360us 1 2.3360us 2.3360us 2.3360us [CUDA memcpy DtoH]
API calls: 70.92% 72.250ms 1 72.250ms 72.250ms 72.250ms cudaSetDevice
26.88% 27.382ms 1 27.382ms 27.382ms 27.382ms cuDevicePrimaryCtxRelease
1.12% 1.1428ms 1 1.1428ms 1.1428ms 1.1428ms cudaLaunchKernel
0.39% 394.80us 2 197.40us 40.800us 354.00us cudaFree
0.38% 383.00us 2 191.50us 7.7000us 375.30us cudaMalloc
0.21% 209.10us 2 104.55us 69.100us 140.00us cudaMemcpy
0.04% 44.700us 1 44.700us 44.700us 44.700us cuLibraryUnload
0.04% 40.600us 114 356ns 0ns 23.100us cuDeviceGetAttribute
0.02% 21.900us 2 10.950us 100ns 21.800us cuDeviceGet
0.01% 5.3000us 1 5.3000us 5.3000us 5.3000us cudaGetDeviceProperties
0.00% 2.2000us 3 733ns 100ns 1.9000us cuDeviceGetCount
0.00% 1.9000us 1 1.9000us 1.9000us 1.9000us cuDeviceTotalMem
0.00% 1.8000us 1 1.8000us 1.8000us 1.8000us cuModuleGetLoadingMode
0.00% 800ns 1 800ns 800ns 800ns cuDeviceGetName
0.00% 600ns 1 600ns 600ns 600ns cuDeviceGetUuid
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetLuid
c:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe 3
==19584== NVPROF is profiling process 19584, command: ./Cuda.exe 3
[host] ./Cuda.exe starting transpose at device 0: NVIDIA GeForce MX450
[host] host answer : 144294235
[host] datasize (10000), gird(79), block(512)
[host] device answer : 144294235
[host] Arrays match.
==19584== Profiling application: ./Cuda.exe 3
==19584== Warning: 36 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==19584== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 55.03% 10.336us 1 10.336us 10.336us 10.336us sumArrayElementK(int*, int*, unsigned int, int, int, int)
33.56% 6.3040us 1 6.3040us 6.3040us 6.3040us [CUDA memcpy HtoD]
11.41% 2.1440us 1 2.1440us 2.1440us 2.1440us [CUDA memcpy DtoH]
API calls: 70.38% 70.508ms 1 70.508ms 70.508ms 70.508ms cudaSetDevice
26.89% 26.937ms 1 26.937ms 26.937ms 26.937ms cuDevicePrimaryCtxRelease
1.21% 1.2144ms 1 1.2144ms 1.2144ms 1.2144ms cudaLaunchKernel
0.84% 846.20us 2 423.10us 26.100us 820.10us cudaFree
0.32% 321.00us 2 160.50us 6.3000us 314.70us cudaMalloc
0.20% 201.00us 2 100.50us 59.200us 141.80us cudaMemcpy
0.09% 86.800us 1 86.800us 86.800us 86.800us cuLibraryUnload
0.05% 54.600us 114 478ns 0ns 36.900us cuDeviceGetAttribute
0.00% 4.7000us 1 4.7000us 4.7000us 4.7000us cudaGetDeviceProperties
0.00% 1.9000us 3 633ns 100ns 1.6000us cuDeviceGetCount
0.00% 1.9000us 1 1.9000us 1.9000us 1.9000us cuDeviceTotalMem
0.00% 1.7000us 1 1.7000us 1.7000us 1.7000us cuModuleGetLoadingMode
0.00% 1.2000us 1 1.2000us 1.2000us 1.2000us cuDeviceGetName
0.00% 900ns 2 450ns 0ns 900ns cuDeviceGet
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetLuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
*/