-
Notifications
You must be signed in to change notification settings - Fork 0
/
kernel.cu
45 lines (35 loc) · 1.46 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#define N_ITERATIONS 8
#define BLOCK_WIDTH 32
#define BLOCK_HEIGHT 32
#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))
__global__ void mySolve(const int *matrix, float *col, float *row, const int height, const int width) {
size_t laneID = threadIdx.x & 31;
size_t x = blockDim.x * blockIdx.x + threadIdx.x; // absolutni X pozice v matici
size_t y = 32 * blockIdx.y + N_ITERATIONS * threadIdx.y; // absolutni Y pozice pocatku pocitani
size_t idx = y * width + x;
int vertical = 0;
for(int i = 0; i < N_ITERATIONS; ++i) {
int v = matrix[idx];
vertical += v;
for(unsigned w = 16; w >= 1; w >>= 1)
v += __shfl_down(v, w);
if(laneID == 0)
atomicAdd(&row[y+i], v);
idx += width;
}
atomicAdd(&col[x], vertical);
}
__global__ void bothAverage(float* first, float *second, const int firstLen, const int secondLen) {
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < firstLen)
first[index] = first[index] / secondLen;
if (index < secondLen)
second[index] = second[index] / firstLen;
}
void solveGPU(const int *matrix, float *avg_stud, float *avg_que, const int students, const int questions) {
cudaMemset(avg_que, 0, questions*sizeof(*avg_que));
cudaMemset(avg_stud, 0, students*sizeof(*avg_stud));
mySolve<<<dim3(questions/32, students/32), dim3(32, 32/N_ITERATIONS)>>>(matrix, avg_que, avg_stud, students, questions);
bothAverage<<<dim3(MAX(students,questions)/32), dim3(32, 1)>>>(avg_que, avg_stud, questions, students);
}