-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cu
330 lines (257 loc) · 8.75 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#include <iostream>
#include <cuComplex.h>
#include <random>
#include <complex>
#include <vector>
#include <chrono>
#define MAX_THREADS_PER_BLOCK 1024
using namespace std::chrono;
__global__ void s0_butterfly(const cuDoubleComplex *x, cuDoubleComplex *X, unsigned int l2N);
__global__ void s1_butterfly(unsigned int s, cuDoubleComplex *X);
__device__ size_t bit_reverse(size_t x, size_t bits);
void cuda_malloc_with_error(void** ptr, size_t size);
void cuda_memcpy_with_error(void* dst, void* src, size_t cnt, cudaMemcpyKind kind);
void cuda_free_with_error(void* ptr);
unsigned int min_val(unsigned int a, unsigned int b);
unsigned int max_val(unsigned int a, unsigned int b);
int l2(int x);
std::complex<double> comp_exp(double th);
bool all_close(const std::vector<std::complex<double>>& a, const cuDoubleComplex* b, size_t N, double threshold);
void fft(const cuDoubleComplex* x_h, int N, cuDoubleComplex* X_h);
std::vector<std::complex<double>> fft_baseline(const std::vector<std::complex<double>>& x);
int main() {
int N = 1 << 24;
std::random_device rd;
std::default_random_engine gen(0);
std::uniform_real_distribution<double> dist(0.0, 1.0);
auto* x = (cuDoubleComplex*) malloc(sizeof(cuDoubleComplex) * N);
std::vector<std::complex<double>> baseline;
for (int i = 0; i < N; ++i)
{
double Re = dist(gen);
double Im = dist(gen);
x[i] = make_cuDoubleComplex(Re, Im);
baseline.emplace_back(Re, Im);
}
high_resolution_clock::time_point t1 = high_resolution_clock::now();
auto gt = fft_baseline(baseline);
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto baseline_time = duration_cast<microseconds>(t2 - t1).count();
auto* X = (cuDoubleComplex*) malloc(sizeof(cuDoubleComplex) * N);
t1 = high_resolution_clock::now();
fft(x, N, X);
t2 = high_resolution_clock::now();
auto gpu_time = duration_cast<microseconds>(t2 - t1).count();
free(x);
if (not all_close(gt, X, N, 0.001))
{
std::cout << "failed" << std::endl;
}
std::cout << "CPU FFT took " << baseline_time << " microseconds" << std::endl;
std::cout << "GPU FFT took " << gpu_time << " microseconds" << std::endl;
free(X);
return 0;
}
void fft(const cuDoubleComplex* x_h, int N, cuDoubleComplex* X_h)
{
cuDoubleComplex* X_d = nullptr, *x_d = nullptr;
size_t size = N * sizeof(cuDoubleComplex);
// allocate space for input and output arrays
cuda_malloc_with_error((void **)&X_d, size);
cuda_malloc_with_error((void **)&x_d, size);
// copy input array to device
cuda_memcpy_with_error((void *)x_d, (void *)x_h, size, cudaMemcpyHostToDevice);
unsigned int l2n = l2(N);
// time the fft computation only, without the memory copying from host to device and device to host
high_resolution_clock::time_point t1 = high_resolution_clock::now();
// s == 1
{
dim3 loop_vars(MAX_THREADS_PER_BLOCK, 1);
dim3 blocks(N / 2 / MAX_THREADS_PER_BLOCK, 1);
s0_butterfly<<<blocks, loop_vars>>>(x_d, X_d, l2n);
}
{
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cout << "Failed to launch s=1 butterfly kernel: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}
// s > 1
for (size_t s = 2; s < N; s *= 2) {
dim3 loop_vars;
// weird logic to calculate the size of each thread block
if (N > 2 * MAX_THREADS_PER_BLOCK)
{
if (s > (N / 2/ s))
{
loop_vars.y = min_val(s, MAX_THREADS_PER_BLOCK);
loop_vars.x = max_val(MAX_THREADS_PER_BLOCK / loop_vars.y, 1);
}
else
{
loop_vars.x = min_val(N / 2 / s, MAX_THREADS_PER_BLOCK);
loop_vars.y = max_val(MAX_THREADS_PER_BLOCK / loop_vars.x, 1);
}
}
dim3 blocks((N / 2 / s) / loop_vars.x, s / loop_vars.y);
s1_butterfly<<<blocks, loop_vars>>>(s, X_d);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cout << "Failed to launch s>1 butterfly kernel: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto baseline_time = duration_cast<microseconds>(t2 - t1).count();
std::cout << "GPU FFT computation only (not including memory copies) took " << baseline_time << " microseconds" << std::endl;
// return result
cuda_memcpy_with_error(X_h, X_d, size, cudaMemcpyDeviceToHost);
// free device memory
cuda_free_with_error(X_d);
cuda_free_with_error(x_d);
}
__global__ void s0_butterfly(const cuDoubleComplex *x, cuDoubleComplex *X, unsigned int l2N)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
cuDoubleComplex f = make_cuDoubleComplex(1., 0.);
unsigned int i1 = 2 * i;
unsigned int i2 = 2 * i + 1;
unsigned int n1 = bit_reverse(i1, l2N);
unsigned int n2 = bit_reverse(i2, l2N);
cuDoubleComplex t1 = cuCadd(x[n1], cuCmul(f, x[n2]));
cuDoubleComplex t2 = cuCsub(x[n1], cuCmul(f, x[n2]));
X[i1] = t1;
X[i2] = t2;
}
__global__ void s1_butterfly(unsigned int s, cuDoubleComplex *X)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x, c = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex f = make_cuDoubleComplex(cos(-M_PI * (double)(c) / (double)(s)), sin(-M_PI * (double)(c) / (double)(s)));
unsigned int i1 = 2 * i * s + c;
unsigned int i2 = 2 * i * s + c + s;
cuDoubleComplex t1 = cuCadd(X[i1], cuCmul(f, X[i2]));
cuDoubleComplex t2 = cuCsub(X[i1], cuCmul(f, X[i2]));
X[i1] = t1;
X[i2] = t2;
}
__device__ size_t bit_reverse(size_t x, size_t bits)
{
size_t ret=0;
for (;bits > 0; bits--)
{
ret<<=1;
if (x&1)
ret++;
x>>=1;
}
return ret;
}
void cuda_malloc_with_error(void** ptr, size_t size)
{
cudaError_t err = cudaMalloc(ptr, size);
if (err != cudaSuccess)
{
std::cout << "Failed to allocate " << size << " bytes with error code " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}
void cuda_memcpy_with_error(void* dst, void* src, size_t cnt, cudaMemcpyKind kind)
{
cudaError_t err = cudaMemcpy(dst, src, cnt, kind);
if (err != cudaSuccess)
{
std::cout << "Failed to copy vector from " << (kind == cudaMemcpyHostToDevice ? "host to device " : "device to host ") << cudaGetErrorString(err) << std::endl;
exit( EXIT_FAILURE);
}
}
void cuda_free_with_error(void* ptr)
{
cudaError_t err = cudaFree(ptr);
if (err != cudaSuccess)
{
std::cout << "Failed to free memory" << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}
int l2(int x)
{
int r = 0;
while (x > 1)
{
++r;
x /= 2;
}
return r;
}
unsigned int min_val(unsigned int a, unsigned int b)
{
return a > b ? b : a;
}
unsigned int max_val(unsigned int a, unsigned int b)
{
return a > b ? a : b;
}
bool all_close(const std::vector<std::complex<double>>& a, const cuDoubleComplex* b, size_t N, double threshold)
{
for (size_t i = 0; i < min_val(a.size(), N); ++i)
{
if (std::abs(a[i].real() - b[i].x) > threshold or std::abs(a[i].imag() - b[i].y) > threshold)
{
return false;
}
}
return true;
}
std::complex<double> comp_exp(double th)
{
return {cos(th), sin(th)};
}
size_t bit_rev(size_t x, size_t bits)
{
size_t ret=0;
for (;bits > 0; bits--)
{
ret<<=1;
if (x&1)
ret++;
x>>=1;
}
return ret;
}
std::vector<std::complex<double>> fft_baseline(const std::vector<std::complex<double>>& x)
{
std::vector<std::complex<double>> X(x.size(), 0.);
std::complex<double> t1, t2, f;
size_t i1, i2, n1, n2;
for (size_t s = 1; s < x.size(); s*=2)
{
for (size_t i = 0; i < x.size() / 2 / s; i++)
{
for (size_t c = 0; c < s; c++) {
f = comp_exp(-M_PI * (double)c / (double)s);
i1 = 2 * i * s + c;
i2 = 2 * i * s + c + s;
if (s == 1)
{
n1 = bit_rev(i1, (size_t) l2((int)x.size()));
n2 = bit_rev(i2, (size_t) l2((int)x.size()));
t1 = x[n1] + f * x[n2];
t2 = x[n1] - f * x[n2];
X[i1] = t1;
X[i2] = t2;
}
else
{
t1 = X[i1] + f * X[i2];
t2 = X[i1] - f * X[i2];
X[i1] = t1;
X[i2] = t2;
}
}
}
}
return X;
}