From f5ecec4c730429e653f27f8a1b563d66a638514e Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Sat, 27 Jul 2024 05:36:17 +0000 Subject: [PATCH] #5 Added x-axis caching edition of the 1d gauss kernel. --- src/lib/cpp/gpu/diffusion.cc | 41 +++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc index dd3a310..f522325 100644 --- a/src/lib/cpp/gpu/diffusion.cc +++ b/src/lib/cpp/gpu/diffusion.cc @@ -79,6 +79,44 @@ namespace gpu { } } } + void diffusion_core_x(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) { + // Note: the use of 32-bit is intentional to reduce register pressure on GPU. Each of the 32-bit values shouldn't exceed 2^32, but the indices (address) to the arrays can be. + constexpr int veclen = 128; + float local[3*veclen], local_kernel[veclen]; // Local memory. + #pragma acc parallel vector_length(veclen) num_workers(1) present(input, kernel, output) + { + #pragma acc loop gang + for (int32_t i = 0; i < N.z; i++) { + #pragma acc loop worker private(local, local_kernel) + for (int32_t j = 0; j < N.y; j++) { + const uint64_t output_start = (uint64_t)i*(uint64_t)N.y*(uint64_t)N.x + (uint64_t)j*(uint64_t)N.x; + #pragma acc cache(local, local_kernel) + { + for (int32_t k = 0; k < N.x; k += veclen) { + #pragma acc loop vector + for (int32_t tid = 0; tid < veclen; tid++) { + const int64_t output_index = output_start + (int64_t)k + (int64_t)tid; + local[tid + 0*veclen] = k+tid >= veclen ? input[output_index - veclen] : 0; + local[tid + 1*veclen] = input[output_index]; + local[tid + 2*veclen] = k+tid + veclen < N.x ? input[output_index + veclen] : 0; + local_kernel[tid] = tid < (2*radius+1) ? kernel[tid] : 0; + } + #pragma acc loop vector + for (int32_t tid = 0; tid < veclen; tid++) { + const int64_t output_index = output_start + (int64_t)k + (int64_t)tid; + float sum = 0.0f; + for (int32_t r = -radius; r <= radius; r++) { + sum += local[tid + veclen + r] * local_kernel[r+radius]; + } + if (k+tid < N.x) + output[output_index] = sum; + } + } + } + } + } + } + } // padding is padding*ny*nx - i.e. number of padding layers in flat size template @@ -277,7 +315,8 @@ namespace gpu { for (int64_t dim = 0; dim < 3; dim++) { diffusion_core(buf0, kernel, buf1, N, dim, radius); std::swap(buf0, buf1); - } + //diffusion_core_x(buf0, kernel, buf1, N, radius); + //std::swap(buf0, buf1); illuminate(voxels, buf0, N.z*N.y*N.x); }