-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path2-2-2.cu
More file actions
235 lines (215 loc) · 12.2 KB
/
2-2-2.cu
File metadata and controls
235 lines (215 loc) · 12.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<bits/stdc++.h>
using namespace std;
#define CHECK(call) \
{ \
const cudaError_t error = call;\
if (error != cudaSuccess) { \
printf("[device] Error: %s %d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(-10*error); \
} \
}
// Check if the computation results of the CPU and GPU are the same
template<typename T>
void checkResult(T* h_data, T *d_data, const int n) {
int match = 1;
for (int i = 0; i < n; i++) {
if (h_data[i] != d_data[i]) {
match = 0;
printf("[host] Arrays do not match!\n");
printf("[host] host %5d gpu %5d at current %d\n", h_data[i], d_data[i], i);
break;
}
}
if (match) printf("[host] Arrays match.\n\n");
}
#define DIM 1024
__global__ void Kogge_Stone_scan_kernel(char* d_A, int* d_B, unsigned int n) {
__shared__ int AB[DIM];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
// boundary check
if (idx >= n) return;
if ('a' <= d_A[idx] && d_A[idx] <= 'z') {
AB[threadIdx.x] = 1;
}
else {
AB[threadIdx.x] = 0;
}
for (unsigned int stride = 1; stride < blockDim.x; stride*=2) {
__syncthreads();
int temp;
if (threadIdx.x >= stride)
temp = AB[threadIdx.x] + AB[threadIdx.x - stride];
__syncthreads();
if (threadIdx.x >= stride) {
AB[threadIdx.x] = temp;
}
}
if (idx < n) {
d_B[idx] = AB[threadIdx.x];
}
}
__global__ void removeUpperCase(char* d_A, char* d_A_out, int* d_B, unsigned int n) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
// boundary check
if (idx >= n) return;
if ('a' <= d_A[idx] && d_A[idx] <= 'z') {
d_A_out[d_B[idx] - 1] = d_A[idx];
}
}
int main(int argc, char** argv) {
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("[host] %s starting transpose at ", argv[0]);
printf("device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
/* host code */
char in_filename[100];
sprintf(in_filename, "%s.in", argv[1]);
freopen(in_filename, "r", stdin);
string A; cin >> A;
string B;
for (int i = 0; i < A.size(); i++) {
if ('a' <= A[i] && A[i] <= 'z') B = B + A[i];
}
int m = (int)B.length();
char* h_answer = (char*)malloc((m + 1) * sizeof(char));
strcpy(h_answer, B.c_str());
//printf("[host] host answer : %s\n", h_answer);
/* host - device code */
int n = (int)A.size();
int nbytes = n * sizeof(char);
char* h_A = (char*)malloc(nbytes + 1);
strcpy(h_A, A.c_str());
/* device code */
char* d_A; char* d_A_out;
int* d_B;
dim3 block(1024, 1);
dim3 grid((n + block.x - 1) / block.x, 1);
CHECK(cudaMalloc((void**)&d_A, nbytes + 1));
CHECK(cudaMalloc((void**)&d_A_out, nbytes + 1));
CHECK(cudaMemset(d_A_out, 0, nbytes + 1));
CHECK(cudaMalloc((void**)&d_B, n * sizeof(int)));
CHECK(cudaMemset(d_B, 0, n * sizeof(int)));
cudaMemcpy(d_A, h_A, nbytes + 1, cudaMemcpyHostToDevice);
printf("[host] datasize (%d), grid(%d, %d), block(%d, %d)\n", nbytes, grid.x, grid.y, block.x, block.y);
Kogge_Stone_scan_kernel << <grid, block >> > (d_A, d_B, n);
cudaDeviceSynchronize();
removeUpperCase << <grid, block >> > (d_A, d_A_out, d_B, n);
cudaDeviceSynchronize();
char* d_answer = (char*)malloc(m + 1);
memset(d_answer, 0, m + 1);
CHECK(cudaMemcpy(d_answer, d_A_out, m, cudaMemcpyDeviceToHost));
//printf("[host] device answer : %s\n", d_answer);
checkResult<char>(h_answer, d_answer, m);
// memory free
free(h_A); free(h_answer);
cudaFree(d_A); cudaFree(d_A_out); cudaFree(d_B); cudaFree(d_answer);
}
/*
output:
C:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe 1
==28984== NVPROF is profiling process 28984, command: ./Cuda.exe 1
[host] ./Cuda.exe starting transpose at device 0: NVIDIA GeForce MX450
[host] datasize (1000), grid(1, 1), block(1024, 1)
[host] Arrays match.
==28984== Profiling application: ./Cuda.exe 1
==28984== Warning: 32 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==28984== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 67.05% 18.624us 1 18.624us 18.624us 18.624us Kogge_Stone_scan_kernel(char*, int*, unsigned int)
11.98% 3.3280us 1 3.3280us 3.3280us 3.3280us removeUpperCase(char*, char*, int*, unsigned int)
8.76% 2.4330us 2 1.2160us 993ns 1.4400us [CUDA memset]
7.72% 2.1440us 1 2.1440us 2.1440us 2.1440us [CUDA memcpy DtoH]
4.50% 1.2490us 1 1.2490us 1.2490us 1.2490us [CUDA memcpy HtoD]
API calls: 74.60% 82.709ms 1 82.709ms 82.709ms 82.709ms cudaSetDevice
22.84% 25.326ms 1 25.326ms 25.326ms 25.326ms cuDevicePrimaryCtxRelease
1.87% 2.0735ms 2 1.0368ms 18.500us 2.0550ms cudaLaunchKernel
0.27% 294.70us 4 73.675us 1.8000us 275.10us cudaFree
0.15% 167.90us 2 83.950us 62.200us 105.70us cudaMemcpy
0.11% 127.30us 3 42.433us 3.0000us 105.70us cudaMalloc
0.05% 55.400us 2 27.700us 5.9000us 49.500us cudaMemset
0.04% 44.700us 2 22.350us 8.0000us 36.700us cudaDeviceSynchronize
0.03% 36.500us 1 36.500us 36.500us 36.500us cuLibraryUnload
0.02% 20.800us 114 182ns 0ns 2.8000us cuDeviceGetAttribute
0.00% 4.3000us 1 4.3000us 4.3000us 4.3000us cudaGetDeviceProperties
0.00% 2.5000us 3 833ns 0ns 2.2000us cuDeviceGetCount
0.00% 1.9000us 1 1.9000us 1.9000us 1.9000us cuModuleGetLoadingMode
0.00% 1.8000us 1 1.8000us 1.8000us 1.8000us cuDeviceTotalMem
0.00% 800ns 2 400ns 0ns 800ns cuDeviceGet
0.00% 700ns 1 700ns 700ns 700ns cuDeviceGetName
0.00% 400ns 1 400ns 400ns 400ns cuDeviceGetLuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
C:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe 2
==19512== NVPROF is profiling process 19512, command: ./Cuda.exe 2
[host] ./Cuda.exe starting transpose at device 0: NVIDIA GeForce MX450
[host] datasize (1000), grid(1, 1), block(1024, 1)
[host] Arrays match.
==19512== Profiling application: ./Cuda.exe 2
==19512== Warning: 12 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==19512== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 67.13% 18.559us 1 18.559us 18.559us 18.559us Kogge_Stone_scan_kernel(char*, int*, unsigned int)
11.69% 3.2320us 1 3.2320us 3.2320us 3.2320us removeUpperCase(char*, char*, int*, unsigned int)
8.92% 2.4660us 2 1.2330us 1.0250us 1.4410us [CUDA memset]
7.87% 2.1750us 1 2.1750us 2.1750us 2.1750us [CUDA memcpy DtoH]
4.40% 1.2160us 1 1.2160us 1.2160us 1.2160us [CUDA memcpy HtoD]
API calls: 71.07% 67.109ms 1 67.109ms 67.109ms 67.109ms cudaSetDevice
26.28% 24.815ms 1 24.815ms 24.815ms 24.815ms cuDevicePrimaryCtxRelease
1.31% 1.2376ms 2 618.80us 11.000us 1.2266ms cudaLaunchKernel
0.59% 554.90us 4 138.73us 3.7000us 525.50us cudaFree
0.29% 273.00us 2 136.50us 82.600us 190.40us cudaMemcpy
0.17% 160.20us 3 53.400us 2.7000us 120.20us cudaMalloc
0.12% 112.80us 1 112.80us 112.80us 112.80us cuLibraryUnload
0.06% 60.900us 2 30.450us 5.6000us 55.300us cudaMemset
0.06% 52.900us 2 26.450us 7.9000us 45.000us cudaDeviceSynchronize
0.04% 36.800us 114 322ns 0ns 12.200us cuDeviceGetAttribute
0.01% 4.9000us 1 4.9000us 4.9000us 4.9000us cudaGetDeviceProperties
0.00% 2.4000us 1 2.4000us 2.4000us 2.4000us cuDeviceTotalMem
0.00% 2.2000us 1 2.2000us 2.2000us 2.2000us cuModuleGetLoadingMode
0.00% 1.8000us 3 600ns 100ns 1.4000us cuDeviceGetCount
0.00% 1.2000us 2 600ns 0ns 1.2000us cuDeviceGet
0.00% 800ns 1 800ns 800ns 800ns cuDeviceGetName
0.00% 400ns 1 400ns 400ns 400ns cuDeviceGetLuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
C:\coding\Cuda\x64\Debug>nvprof ./Cuda.exe 3
==20284== NVPROF is profiling process 20284, command: ./Cuda.exe 3
[host] ./Cuda.exe starting transpose at device 0: NVIDIA GeForce MX450
[host] datasize (1000), grid(1, 1), block(1024, 1)
[host] Arrays match.
==20284== Profiling application: ./Cuda.exe 3
==20284== Warning: 31 API trace records have same start and end timestamps.
This can happen because of short execution duration of CUDA APIs and low timer resolution on the underlying operating system.
==20284== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 66.63% 18.720us 1 18.720us 18.720us 18.720us Kogge_Stone_scan_kernel(char*, int*, unsigned int)
11.84% 3.3270us 1 3.3270us 3.3270us 3.3270us removeUpperCase(char*, char*, int*, unsigned int)
8.77% 2.4640us 2 1.2320us 1.0240us 1.4400us [CUDA memset]
7.75% 2.1760us 1 2.1760us 2.1760us 2.1760us [CUDA memcpy DtoH]
5.01% 1.4080us 1 1.4080us 1.4080us 1.4080us [CUDA memcpy HtoD]
API calls: 71.47% 61.935ms 1 61.935ms 61.935ms 61.935ms cudaSetDevice
26.28% 22.773ms 1 22.773ms 22.773ms 22.773ms cuDevicePrimaryCtxRelease
1.16% 1.0089ms 2 504.45us 11.700us 997.20us cudaLaunchKernel
0.39% 341.70us 4 85.425us 1.2000us 310.90us cudaFree
0.20% 175.50us 2 87.750us 46.800us 128.70us cudaMemcpy
0.17% 149.80us 1 149.80us 149.80us 149.80us cuLibraryUnload
0.17% 149.50us 3 49.833us 2.7000us 142.50us cudaMalloc
0.08% 69.700us 2 34.850us 32.700us 37.000us cudaMemset
0.03% 28.300us 2 14.150us 8.3000us 20.000us cudaDeviceSynchronize
0.02% 19.500us 114 171ns 0ns 2.5000us cuDeviceGetAttribute
0.00% 4.2000us 1 4.2000us 4.2000us 4.2000us cudaGetDeviceProperties
0.00% 2.8000us 3 933ns 0ns 2.3000us cuDeviceGetCount
0.00% 2.7000us 1 2.7000us 2.7000us 2.7000us cuModuleGetLoadingMode
0.00% 1.7000us 1 1.7000us 1.7000us 1.7000us cuDeviceTotalMem
0.00% 900ns 2 450ns 100ns 800ns cuDeviceGet
0.00% 700ns 1 700ns 700ns 700ns cuDeviceGetName
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetLuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
*/