-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_cpu_overhead.cu
More file actions
99 lines (77 loc) · 3.28 KB
/
test_cpu_overhead.cu
File metadata and controls
99 lines (77 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include <chrono>
#include <cuda_runtime.h>
#include <iostream>
#define CUDA_CHECK(call) \
do { \
cudaError_t e = call; \
if (e != cudaSuccess) { \
std::cerr << "CUDA Error: " << cudaGetErrorString(e) << " at " \
<< __FILE__ << ":" << __LINE__ << std::endl; \
std::exit(1); \
} \
} while (0)
__global__ void empty_kernel() {}
__global__ void busy_kernel(float *A, size_t N) {
auto local_x = blockIdx.x * blockDim.x + threadIdx.x;
if (local_x < N)
A[local_x] += 1;
}
int main() {
const int N = 10000;
using clock = std::chrono::high_resolution_clock;
// force context creation
CUDA_CHECK(cudaFree(0));
void *hptr = nullptr;
void *dptr = nullptr;
size_t total_bytes = 4 * 1 << 20; // 1024 * 1024 * 1024 -> 2^30
// total_bytes = 576 * 2 * 128;
CUDA_CHECK(cudaMallocHost(&hptr, total_bytes));
CUDA_CHECK(cudaMalloc(&dptr, total_bytes));
dim3 threads (256);
int num_of_float = total_bytes / sizeof(float);
dim3 blocks ((num_of_float + 256 - 1) / 256);
// at first warmup a little bit
// for (int i = 0; i < 100; ++i) {
// empty_kernel<<<1,1>>>();
// busy_kernel<<<blocks, threads>>>((float*) dptr, total_bytes / sizeof(float));
// CUDA_CHECK(cudaMemcpyAsync(dptr, hptr, 0, cudaMemcpyHostToDevice));
// CUDA_CHECK(cudaMemcpyAsync(dptr, hptr, total_bytes, cudaMemcpyHostToDevice));
// }
// 1) first measure plain enqueue+sync
auto t0 = clock::now();
cudaStream_t s0;
CUDA_CHECK(cudaStreamCreate(&s0));
for (int i = 0; i < N; i++) {
// busy_kernel<<<blocks, threads, 0, s0>>>((float*) dptr, total_bytes / sizeof(float));
cudaMemcpyAsync(dptr, hptr, total_bytes, cudaMemcpyHostToDevice, s0);
}
double host_us =
std::chrono::duration<double>(clock::now() - t0).count() * 1e6;
std::cout << "Plain enqueue: " << host_us << " us\n";
CUDA_CHECK(cudaStreamSynchronize(s0));
// 2) build the CUDA Graph
cudaStream_t s;
CUDA_CHECK(cudaStreamCreate(&s));
cudaGraph_t graph;
cudaGraphExec_t graphExec;
CUDA_CHECK(cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal));
for (int i = 0; i < N; i++) {
// busy_kernel<<<blocks, threads, 0, s>>>((float*) dptr, total_bytes / sizeof(float));
cudaMemcpyAsync(dptr, hptr, total_bytes, cudaMemcpyHostToDevice, s);
}
CUDA_CHECK(cudaStreamEndCapture(s, &graph));
CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
// 3) launch the graph once and time it
t0 = clock::now();
CUDA_CHECK(cudaGraphLaunch(graphExec, s));
double graph_us =
std::chrono::duration<double>(clock::now() - t0).count() * 1e6;
std::cout << "Graph-launched: " << graph_us << " us\n";
std::cout << "Delta T in us: " << host_us - graph_us << " us\n";
CUDA_CHECK(cudaStreamSynchronize(s));
// cleanup
CUDA_CHECK(cudaGraphExecDestroy(graphExec));
CUDA_CHECK(cudaGraphDestroy(graph));
CUDA_CHECK(cudaStreamDestroy(s));
return 0;
}