From e144ec126da7361f87c328bd6f35499e3a8bcac1 Mon Sep 17 00:00:00 2001 From: Carl Johnsen Date: Mon, 29 Jul 2024 13:33:38 +0000 Subject: [PATCH] #5 Added diffusion_core_z --- src/lib/cpp/gpu/diffusion.cc | 60 ++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc index 6bb4c2f..95db85f 100644 --- a/src/lib/cpp/gpu/diffusion.cc +++ b/src/lib/cpp/gpu/diffusion.cc @@ -116,6 +116,60 @@ namespace gpu { } } + // TODO Make this call y, since they're VERY similar? + void diffusion_core_z(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) { + // Assumes that the x dimension is a multiple of veclen. + constexpr int32_t + worklen = 1, + veclen = 64, + max_k = 32, + sqvec = max_k*veclen; + const int32_t + kernel_size = 2*radius+1, + nz = N.z, ny = N.y, nx = N.x; + #pragma acc parallel vector_length(veclen) num_workers(worklen) present(input, kernel, output) + { + #pragma acc loop gang collapse(2) + for (int32_t y = 0; y < ny; y++) { + //#pragma acc loop worker + for (int32_t x = 0; x < nx; x += veclen) { + float local[sqvec], local_kernel[max_k]; // Local memory. + #pragma acc cache(local_kernel, local) + { + #pragma acc loop vector + for (int32_t tid = 0; tid < veclen; tid++) { + #pragma acc loop seq + for (int32_t z = 0; z < radius; z++) { + local[z*veclen + tid] = 0; // Zero out the local memory. + } + #pragma acc loop seq + for (int32_t z = radius; z < kernel_size; z++) { + local[z*veclen + tid] = input[(z-radius)*ny*nx + y*nx + x + tid]; + } + // Load the kernel into the local memory. + local_kernel[tid] = tid < kernel_size ? kernel[tid] : 0; + } + #pragma acc loop seq + for (int32_t z = 0; z < nz; z++) { + #pragma acc loop vector + for (int32_t tid = 0; tid < veclen; tid++) { + float sum = local[tid] * local_kernel[0]; + #pragma acc loop seq + for (int32_t r = 1; r < kernel_size; r++) { + float val = local[r*veclen + tid]; + sum += val * local_kernel[r]; + local[(r-1)*veclen + tid] = val; + } + output[z*ny*nx + y*nx + x + tid] = sum; + local[(kernel_size-1)*veclen + tid] = z+radius+1 < nz ? input[(z+radius+1)*ny*nx + y*nx + x + tid] : 0; + } + } + } + } + } + } + } + void diffusion_core_y(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) { // Assumes that the x dimension is a multiple of veclen. constexpr int32_t @@ -480,11 +534,11 @@ namespace gpu { constexpr int32_t veclen = 64; const shape_t P = { N.z, N.y, (N.x + veclen-1) / veclen * veclen }; //for (int64_t dim = 0; dim < 3; dim++) { - // diffusion_core_og(buf0, kernel, buf1, N, dim, radius); + // diffusion_core(buf0, kernel, buf1, N, dim, radius); // std::swap(buf0, buf1); //} - diffusion_core_y(buf0, kernel, buf1, P, radius); - std::swap(buf0, buf1); + diffusion_core_z(buf0, kernel, buf1, P, radius); + diffusion_core_y(buf1, kernel, buf0, P, radius); diffusion_core_x(buf0, kernel, buf1, P, radius); std::swap(buf0, buf1); illuminate(voxels, buf0, N, P);