-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.cu
106 lines (81 loc) · 3.13 KB
/
test.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <stdio.h>
// these are just for timing measurments
#include <time.h>
// error checking macro
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
const int DSIZE = 32;
const int block_size = 16; // CUDA maximum is 1024 *total* threads in block
const float A_val = 1.0f;
const float B_val = 2.0f;
// matrix multiply (naive) kernel: C = A * B
__global__ void mmul(const float *A, const float *B, float *C, int ds) {
int idx = threadIdx.x+blockDim.x*blockIdx.x; // create thread x index
int idy = threadIdx.y+blockDim.y*blockIdx.y; // create thread y index
if ((idx < ds) && (idy < ds)){
float temp = 0;
for (int i = 0; i < ds; i++)
temp += A[idx*ds+i] * B[i*ds+idy]; // dot product of row and column
C[idy*ds+idx] = temp;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
// these are just for timing
clock_t t0, t1, t2;
double t1sum=0.0;
double t2sum=0.0;
// start timing
t0 = clock();
h_A = new float[DSIZE*DSIZE];
h_B = new float[DSIZE*DSIZE];
h_C = new float[DSIZE*DSIZE];
for (int i = 0; i < DSIZE*DSIZE; i++){
h_A[i] = A_val;
h_B[i] = B_val;
h_C[i] = 0;}
// Initialization timing
t1 = clock();
t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
printf("Init took %f seconds. Begin compute\n", t1sum);
// Allocate device memory and copy input data over to GPU
cudaMalloc(&d_A, DSIZE*DSIZE*sizeof(float));
cudaMalloc(&d_B, DSIZE*DSIZE*sizeof(float));
cudaMalloc(&d_C, 4*DSIZE*DSIZE*sizeof(float));
cudaCheckErrors("cudaMalloc failure");
cudaMemcpy(d_A, h_A, DSIZE*DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
cudaMemcpy(d_B, h_B, DSIZE*DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
// Cuda processing sequence step 1 is complete
// Launch kernel
dim3 block(block_size, block_size); // dim3 variable holds 3 dimensions
dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
mmul<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaCheckErrors("kernel launch failure");
// Cuda processing sequence step 2 is complete
// Copy results back to host
cudaMemcpy(h_C, d_C, 4*DSIZE*DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D2H failure");
//PRINT c
// for (int i = 0; i < 4*DSIZE*DSIZE; i++) printf("%f ",h_C[i]);
// GPU timing
t2 = clock();
t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
printf ("Done. Compute took %f seconds\n", t2sum);
// Cuda processing sequence step 3 is complete
// Verify results
cudaCheckErrors("kernel execution failure or cudaMemcpy H2D failure");
for (int i = 0; i < DSIZE*DSIZE; i++) if (h_C[i] != A_val*B_val*DSIZE) {printf("mismatch at index %d, was: %f, should be: %f\n", i, h_C[i], A_val*B_val*DSIZE); return -1;}
printf("Success!\n");
return 0;
}