From 877742a98ef0e9ef2d6fe19f78706654a5b4f3df Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Sat, 27 Jul 2024 05:36:50 +0000 Subject: [PATCH] #5 Added original 1d gauss for comparison --- src/lib/cpp/gpu/diffusion.cc | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc index f522325..97c07f2 100644 --- a/src/lib/cpp/gpu/diffusion.cc +++ b/src/lib/cpp/gpu/diffusion.cc @@ -79,6 +79,42 @@ namespace gpu { } } } + + void diffusion_core_og(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t dim, const int32_t radius) { + // Note: the use of 32-bit is intentional to reduce register pressure on GPU. Each of the 32-bit values shouldn't exceed 2^32, but the indices (address) to the arrays can be. + + const int32_t + strides[3] = {(N.y)*(N.x), N.x, 1}, + stride = strides[dim], + Ns[3] = {N.z, N.y, N.x}; + + #pragma acc parallel loop collapse(3) present(input, kernel, output) + for (int32_t i = 0; i < N.z; i++) { + for (int32_t j = 0; j < N.y; j++) { + for (int32_t k = 0; k < N.x; k++) { + const int32_t + X[3] = {i, j, k}, + ranges[2] = { + -std::min(radius, X[dim]), std::min(radius, Ns[dim]-X[dim]-1) + }; + const int64_t output_index = (uint64_t)i*(uint64_t)strides[0] + (uint64_t)j*(uint64_t)strides[1] + (uint64_t)k*(uint64_t)strides[2]; + + float sum = 0.0f; + + for (int32_t r = -radius; r <= radius; r++) { + const int64_t input_index = output_index + (int64_t)r*(int64_t)stride; + bool cond = r >= ranges[0] && r <= ranges[1]; + + // Original: + float val = cond ? input[input_index] : 0.0f; + sum += val * kernel[radius+r]; + } + + output[output_index] = sum; + } + } + } + } void diffusion_core_x(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) { // Note: the use of 32-bit is intentional to reduce register pressure on GPU. Each of the 32-bit values shouldn't exceed 2^32, but the indices (address) to the arrays can be. constexpr int veclen = 128;