From 9707bbda30456dbbc3d11c78058bda0de6917dc9 Mon Sep 17 00:00:00 2001
From: Carl Johnsen
Date: Mon, 29 Jul 2024 13:19:30 +0000
Subject: [PATCH] #5 diffusion_core_x now exploits aligned x axis. Plus,
 increased the number of launched blocks.

---
 src/lib/cpp/gpu/diffusion.cc | 64 +++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc
index b1f0748..6bb4c2f 100644
--- a/src/lib/cpp/gpu/diffusion.cc
+++ b/src/lib/cpp/gpu/diffusion.cc
@@ -171,39 +171,65 @@ namespace gpu {
     void diffusion_core_x(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) {
         // Note: the use of 32-bit is intentional to reduce register pressure on GPU. Each of the 32-bit values shouldn't exceed 2^32, but the indices (address) to the arrays can be.
-        constexpr int32_t
-            veclen = 32,
-            max_k = 32;
+        constexpr int32_t veclen = 32;
         float local[3*veclen], local_kernel[veclen]; // Local memory.
+        const int32_t nz = N.z, ny = N.y, nx = N.x;
 
         #pragma acc parallel vector_length(veclen) num_workers(1) present(input, kernel, output)
         {
-            #pragma acc loop gang
-            for (int32_t z = 0; z < N.z; z++) {
-                #pragma acc loop worker private(local, local_kernel)
-                for (int32_t y = 0; y < N.y; y++) {
-                    const uint64_t output_start = (uint64_t)z*(uint64_t)N.y*(uint64_t)N.x + (uint64_t)y*(uint64_t)N.x;
+            #pragma acc loop gang collapse(2)
+            for (int32_t z = 0; z < nz; z++) {
+                //#pragma acc loop worker private(local, local_kernel)
+                for (int32_t y = 0; y < ny; y++) {
                     #pragma acc cache(local, local_kernel)
                     {
-                        for (int32_t x = 0; x < N.x; x += veclen) {
+                        #pragma acc loop vector
+                        for (int32_t tid = 0; tid < veclen; tid++) {
+                            const int64_t output_index = z*ny*nx + y*nx + tid; // x = 0.
+                            local[0*veclen + tid] = 0;
+                            local[1*veclen + tid] = input[output_index];
+                            local[2*veclen + tid] = input[output_index + veclen];
+                            local_kernel[tid] = tid < (2*radius+1) ? kernel[tid] : 0;
+                        }
+                        #pragma acc loop vector
+                        for (int32_t tid = 0; tid < veclen; tid++) {
+                            float sum = 0.0f;
+                            #pragma acc loop seq
+                            for (int32_t r = -radius; r <= radius; r++) {
+                                sum += local[tid + veclen + r] * local_kernel[r+radius];
+                            }
+                            output[z*ny*nx + y*nx + tid] = sum; // x = 0.
+                        }
+                        for (int32_t x = 1; x < nx-1; x += veclen) {
                             #pragma acc loop vector
                             for (int32_t tid = 0; tid < veclen; tid++) {
-                                const int64_t output_index = output_start + (int64_t)x + (int64_t)tid;
-                                local[tid + 0*veclen] = x+tid >= veclen ? input[output_index - veclen] : 0;
-                                local[tid + 1*veclen] = input[output_index];
-                                local[tid + 2*veclen] = x+tid + veclen < N.x ? input[output_index + veclen] : 0;
-                                local_kernel[tid] = tid < (2*radius+1) ? kernel[tid] : 0;
+                                local[0*veclen + tid] = local[1*veclen + tid];
+                                local[1*veclen + tid] = local[2*veclen + tid];
+                                local[2*veclen + tid] = input[z*ny*nx + y*nx + x + tid + veclen];
                             }
                             #pragma acc loop vector
                             for (int32_t tid = 0; tid < veclen; tid++) {
-                                const int64_t output_index = output_start + (int64_t)x + (int64_t)tid;
                                 float sum = 0.0f;
                                 #pragma acc loop seq
                                 for (int32_t r = -radius; r <= radius; r++) {
                                     sum += local[tid + veclen + r] * local_kernel[r+radius];
                                 }
-                                if (x+tid < N.x)
-                                    output[output_index] = sum;
+                                output[z*ny*nx + y*nx + x + tid] = sum;
+                            }
+                        }
+                        #pragma acc loop vector
+                        for (int32_t tid = 0; tid < veclen; tid++) {
+                            local[0*veclen + tid] = local[1*veclen + tid];
+                            local[1*veclen + tid] = local[2*veclen + tid];
+                            local[2*veclen + tid] = 0;
+                        }
+                        #pragma acc loop vector
+                        for (int32_t tid = 0; tid < veclen; tid++) {
+                            float sum = 0.0f;
+                            #pragma acc loop seq
+                            for (int32_t r = -radius; r <= radius; r++) {
+                                sum += local[tid + veclen + r] * local_kernel[r+radius];
                             }
+                            output[z*ny*nx + y*nx + nx-1 + tid] = sum;
                         }
                     }
                 }
@@ -451,7 +477,7 @@ namespace gpu {
     }
 
     void diffusion_step(const uint8_t *__restrict__ voxels, float *buf0, float *buf1, const shape_t &N, const float *__restrict__ kernel, const int64_t radius) {
-        constexpr int32_t veclen = 256;
+        constexpr int32_t veclen = 64;
        const shape_t P = { N.z, N.y, (N.x + veclen-1) / veclen * veclen };
         //for (int64_t dim = 0; dim < 3; dim++) {
         //    diffusion_core_og(buf0, kernel, buf1, N, dim, radius);
@@ -478,7 +504,7 @@ namespace gpu {
     }
 
     void diffusion_in_memory(const uint8_t *__restrict__ voxels, const shape_t &N, const float *__restrict__ kernel, const int64_t kernel_size, const int64_t repititions, uint16_t *__restrict__ output) {
-        constexpr int32_t veclen = 256; // TODO
+        constexpr int32_t veclen = 64; // TODO
         const shape_t P = { N.z, N.y, (N.x + veclen-1) / veclen * veclen };
         const int64_t
             padded_size = P.z*P.y*P.x,
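
For reference, the rewritten diffusion_core_x above keeps a three-segment sliding window of each x-row in local[] (previous, current, and next veclen-wide chunks), so the interior segments need no per-element bounds checks; the first and last segments get dedicated prologue/epilogue loops, and diffusion_step pads N.x up to a multiple of veclen so the row accesses stay aligned. The serial CPU sketch below only illustrates that sliding-window pattern; the function name and the explicit bounds checks (kept so the sketch works for arbitrary nx on its own) are assumptions, not code from this patch.

#include <cstdint>
#include <vector>

// Serial sketch of a sliding-window 1D convolution along x.
// Assumes radius <= veclen and that out does not alias row.
void conv_x_row_reference(const float *row, const float *kernel, float *out,
                          int32_t nx, int32_t radius, int32_t veclen = 32) {
    std::vector<float> local(3 * veclen, 0.0f); // [previous | current | next]

    // Prologue: fill the window for the first segment; the "previous"
    // segment stays zero (boundary padding).
    for (int32_t tid = 0; tid < veclen; tid++) {
        local[1 * veclen + tid] = (tid < nx) ? row[tid] : 0.0f;
        local[2 * veclen + tid] = (veclen + tid < nx) ? row[veclen + tid] : 0.0f;
    }

    for (int32_t x = 0; x < nx; x += veclen) {
        // Convolve the "current" segment; neighbour reads spill into the
        // "previous"/"next" segments without touching global memory again.
        for (int32_t tid = 0; tid < veclen && x + tid < nx; tid++) {
            float sum = 0.0f;
            for (int32_t r = -radius; r <= radius; r++)
                sum += local[veclen + tid + r] * kernel[r + radius];
            out[x + tid] = sum;
        }
        // Shift the window one segment to the right and load the next chunk.
        for (int32_t tid = 0; tid < veclen; tid++) {
            local[0 * veclen + tid] = local[1 * veclen + tid];
            local[1 * veclen + tid] = local[2 * veclen + tid];
            const int32_t nxt = x + 2 * veclen + tid;
            local[2 * veclen + tid] = (nxt < nx) ? row[nxt] : 0.0f;
        }
    }
}

Running this once per (z, y) row corresponds to what the GPU kernel does with #pragma acc loop vector across tid, minus the bounds checks that the padding makes unnecessary there.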