-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmp4.cu
159 lines (128 loc) · 5.57 KB
/
mp4.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include <wb.h>
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "CUDA error: ", cudaGetErrorString(err)); \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while (0)
//@@ Define any useful program-wide constants here
#define TILE_WIDTH 3
#define RADIUS 1
#define INPUT_WIDTH 5
//@@ Define constant memory for device kernel here
// the shared cache
__constant__ float M[TILE_WIDTH * TILE_WIDTH * TILE_WIDTH];
// input should be the same size
__global__ void conv3d(float *input, float *output, const int z_size,
const int y_size, const int x_size) {
//@@ Insert kernel code here
// load data, using strategy 3, load the core, judge the boundary and read halo direct from the global memory
__shared__ float N_sds[INPUT_WIDTH][INPUT_WIDTH][INPUT_WIDTH];
int tx = threadIdx.x;
int ty = threadIdx.y;
int tz = threadIdx.z;
int bx = blockIdx.x * TILE_WIDTH;
int by = blockIdx.y * TILE_WIDTH;
int bz = blockIdx.z * TILE_WIDTH;
int kt = tz * TILE_WIDTH * TILE_WIDTH + ty * TILE_WIDTH + tx;
if (kt < INPUT_WIDTH * INPUT_WIDTH){
// calculate the tile index
int tile_x = kt % (INPUT_WIDTH);
int tile_y = (kt / INPUT_WIDTH) % INPUT_WIDTH;
for (int i =0; i < INPUT_WIDTH; ++i){
int z_start = bz - 1 + i;
if (bx + tile_x - 1 < x_size && bx + tile_x - 1 >= 0 && by + tile_y - 1 < y_size && by + tile_y - 1 >= 0 && z_start < z_size && z_start >= 0){
N_ds[bx + tile_x - 1][by + tile_y - 1][i] = input[z_start * (y_size * x_size) + (by + tile_y -1) * x_size + bx+tile_x-1];
}else {
N_ds[bx + tile_x - 1][by + tile_y - 1][i] = 0;
}
}
}
__syncthreads();
if (bx + tx >= 0 && bx + tx < x_size && by + ty >= 0 && by + ty < y_size && bz + tz >= 0 && bz + tz < z_size)
{
float P = 0;
for (int x = 0; x < TILE_WIDTH; x++){
for (int y = 0; y < TILE_WIDTH; y++){
for (int z = 0; z < TILE_WIDTH; z++){
P += N_ds[tx + x ][ty + y][tz + z] * M[z * (TILE_WIDTH * TILE_WIDTH) + y * (TILE_WIDTH) + x];
}
}
}
output[(tz + bz) * (y_size * x_size) + (ty + by) * (x_size) + (tx+bx)] = P;
}
}
int main(int argc, char *argv[]) {
wbArg_t args;
int z_size;
int y_size;
int x_size;
int inputLength, kernelLength;
float *hostInput;
float *hostKernel;
float *hostOutput;
float *deviceInput;
float *deviceOutput;
args = wbArg_read(argc, argv);
// Import data
hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength);
hostKernel =
(float *)wbImport(wbArg_getInputFile(args, 1), &kernelLength);
hostOutput = (float *)malloc(inputLength * sizeof(float));
// First three elements are the input dimensions
z_size = hostInput[0];
y_size = hostInput[1];
x_size = hostInput[2];
wbLog(TRACE, "The input size is ", z_size, "x", y_size, "x", x_size);
// Recall that inputLength is 3 elements longer than the input data
// because the first three elements were the dimensions
assert(z_size * y_size * x_size == inputLength - 3);
assert(kernelLength == 27);
// set the TILE_LENGTH == 3 * 3 * 3
int matrix_len = inputLength - 3;
int matrix_size = matrix_len * sizeof(float);
wbTime_start(GPU, "Doing GPU Computation (memory + compute)");
wbTime_start(GPU, "Doing GPU memory allocation");
//@@ Allocate GPU memory here
// Recall that inputLength is 3 elements longer than the input data
// because the first three elements were the dimensions
cudaMalloc((void **)&deviceInput, matrix_size);
cudaMalloc((void **)&deviceOutput, matrix_size);
wbTime_stop(GPU, "Doing GPU memory allocation");
wbTime_start(Copy, "Copying data to the GPU");
//@@ Copy input and kernel to GPU here
cudaMemcpy(deviceInput, hostInput + 3, matrix_size, cudaMemcpyHostToDevice);
// constant mem declared elsewhere
cudaMemcpyToSymbol(M, hostKernel, kernelLength * sizeof(float), 0, cudaMemcpyHostToDevice);
wbTime_stop(Copy, "Copying data to the GPU");
wbTime_start(Compute, "Doing the computation on the GPU");
//@@ Initialize grid and block dimensions here
dim3 GridDim(ceil((1.0 * x_size)/TILE_WIDTH), ceil((1.0 * y_size)/TILE_WIDTH), ceil((1.0 * z_size)/TILE_WIDTH));
dim3 BlockDim(TILE_WIDTH, TILE_WIDTH, TILE_WIDTH);
//@@ Launch the GPU kernel here
conv3d<<<GridDim, BlockDim>>>(deviceInput, deviceOutput, z_size, y_size, x_size);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Doing the computation on the GPU");
wbTime_start(Copy, "Copying data from the GPU");
//@@ Copy the device memory back to the host here
// Recall that the first three elements of the output are the dimensions
// and should not be set here (they are set below)
cudaMemcpy(hostOutput+3, deviceOutput, matrix_size, cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying data from the GPU");
wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");
// Set the output dimensions for correctness checking
hostOutput[0] = z_size;
hostOutput[1] = y_size;
hostOutput[2] = x_size;
wbSolution(args, hostOutput, inputLength);
// Free device memory
cudaFree(deviceInput);
cudaFree(deviceOutput);
// Free host memory
free(hostInput);
free(hostOutput);
return 0;
}