Skip to content

Commit

Permalink
#5 Added original 1d gauss for comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
carljohnsen committed Jul 27, 2024
1 parent f5ecec4 commit 877742a
Showing 1 changed file with 36 additions and 0 deletions.
36 changes: 36 additions & 0 deletions src/lib/cpp/gpu/diffusion.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,42 @@ namespace gpu {
}
}
}

void diffusion_core_og(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t dim, const int32_t radius) {
// Note: the use of 32-bit is intentional to reduce register pressure on GPU. Each of the 32-bit values shouldn't exceed 2^32, but the indices (address) to the arrays can be.

const int32_t
strides[3] = {(N.y)*(N.x), N.x, 1},
stride = strides[dim],
Ns[3] = {N.z, N.y, N.x};

#pragma acc parallel loop collapse(3) present(input, kernel, output)
for (int32_t i = 0; i < N.z; i++) {
for (int32_t j = 0; j < N.y; j++) {
for (int32_t k = 0; k < N.x; k++) {
const int32_t
X[3] = {i, j, k},
ranges[2] = {
-std::min(radius, X[dim]), std::min(radius, Ns[dim]-X[dim]-1)
};
const int64_t output_index = (uint64_t)i*(uint64_t)strides[0] + (uint64_t)j*(uint64_t)strides[1] + (uint64_t)k*(uint64_t)strides[2];

float sum = 0.0f;

for (int32_t r = -radius; r <= radius; r++) {
const int64_t input_index = output_index + (int64_t)r*(int64_t)stride;
bool cond = r >= ranges[0] && r <= ranges[1];

// Original:
float val = cond ? input[input_index] : 0.0f;
sum += val * kernel[radius+r];
}

output[output_index] = sum;
}
}
}
}
void diffusion_core_x(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) {
// Note: the use of 32-bit is intentional to reduce register pressure on GPU. Each of the 32-bit values shouldn't exceed 2^32, but the indices (address) to the arrays can be.
constexpr int veclen = 128;
Expand Down

0 comments on commit 877742a

Please sign in to comment.