From 4e21798f074bfd9963ab40f911f143fe45c75944 Mon Sep 17 00:00:00 2001
From: Carl Johnsen <carl-johannes@di.ku.dk>
Date: Tue, 30 Jul 2024 09:22:22 +0000
Subject: [PATCH] #5 Implemented the new diffusion core functions in the
 out-of-core implementation

---
 src/lib/cpp/gpu/diffusion.cc | 147 +++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 65 deletions(-)

diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc
index f6e6bc1..483a3f7 100644
--- a/src/lib/cpp/gpu/diffusion.cc
+++ b/src/lib/cpp/gpu/diffusion.cc
@@ -238,21 +238,21 @@ namespace gpu {
                     {
                         { // First iteration
                             int32_t x = 0;
-                        #pragma acc loop vector
-                        for (int32_t tid = 0; tid < veclen; tid++) {
+                            #pragma acc loop vector
+                            for (int32_t tid = 0; tid < veclen; tid++) {
                                 const int64_t input_index = z*ny*nx + y*nx + x + tid;
-                            local[0*veclen + tid] = 0;
+                                local[0*veclen + tid] = 0;
                                 local[1*veclen + tid] = input[input_index];
                                 local[2*veclen + tid] = input[input_index + veclen];
-                            local_kernel[tid] = tid < (2*radius+1) ? kernel[tid] : 0;
-                        }
-                        #pragma acc loop vector
-                        for (int32_t tid = 0; tid < veclen; tid++) {
-                            float sum = 0.0f;
-                            #pragma acc loop seq
-                            for (int32_t r = -radius; r <= radius; r++) {
-                                sum += local[tid + veclen + r] * local_kernel[r+radius];
+                                local_kernel[tid] = tid < (2*radius+1) ? kernel[tid] : 0;
                             }
+                            #pragma acc loop vector
+                            for (int32_t tid = 0; tid < veclen; tid++) {
+                                float sum = 0.0f;
+                                #pragma acc loop seq
+                                for (int32_t r = -radius; r <= radius; r++) {
+                                    sum += local[tid + veclen + r] * local_kernel[r+radius];
+                                }
                                 output[z*ny*nx + y*nx + x + tid] = sum;
                             }
                         }
@@ -275,19 +275,19 @@ namespace gpu {
                         }
                         { // Last iteration
                             int32_t x = nx-veclen;
-                        #pragma acc loop vector
-                        for (int32_t tid = 0; tid < veclen; tid++) {
-                            local[0*veclen + tid] = local[1*veclen + tid];
-                            local[1*veclen + tid] = local[2*veclen + tid];
-                            local[2*veclen + tid] = 0;
-                        }
-                        #pragma acc loop vector
-                        for (int32_t tid = 0; tid < veclen; tid++) {
-                            float sum = 0.0f;
-                            #pragma acc loop seq
-                            for (int32_t r = -radius; r <= radius; r++) {
-                                sum += local[tid + veclen + r] * local_kernel[r+radius];
+                            #pragma acc loop vector
+                            for (int32_t tid = 0; tid < veclen; tid++) {
+                                local[0*veclen + tid] = local[1*veclen + tid];
+                                local[1*veclen + tid] = local[2*veclen + tid];
+                                local[2*veclen + tid] = 0;
                             }
+                            #pragma acc loop vector
+                            for (int32_t tid = 0; tid < veclen; tid++) {
+                                float sum = 0.0f;
+                                #pragma acc loop seq
+                                for (int32_t r = -radius; r <= radius; r++) {
+                                    sum += local[tid + veclen + r] * local_kernel[r+radius];
+                                }
                                 output[z*ny*nx + y*nx + x + tid] = sum;
                             }
                         }
@@ -546,7 +546,7 @@ namespace gpu {
         diffusion_core_z(buf0, kernel, buf1, P, radius);
         diffusion_core_y(buf1, kernel, buf0, P, radius);
         diffusion_core_x(buf0, kernel, buf1, P, radius);
-            std::swap(buf0, buf1);
+        std::swap(buf0, buf1);
         illuminate(voxels, buf0, N, P);
     }
 
@@ -724,16 +724,20 @@ namespace gpu {
     }
 
     void diffusion_out_of_core(const uint8_t *__restrict__ voxels, const shape_t &total_shape, const shape_t &global_shape, const float *__restrict__ kernel, const int64_t kernel_size, const int64_t repititions, uint16_t *__restrict__ output) {
-        const shape_t global_shape_padded = {global_shape.z+kernel_size-1, global_shape.y, global_shape.x};
+        constexpr int32_t veclen = 64; // TODO
+        const shape_t
+            total_shape_padded = {total_shape.z, total_shape.y, (total_shape.x + veclen - 1) / veclen * veclen},
+            global_shape_padded = {global_shape.z+kernel_size-1, global_shape.y, (global_shape.x + veclen - 1) / veclen * veclen};
         const int64_t
             total_size = total_shape.z*total_shape.y*total_shape.x,
+            total_size_padded = total_shape_padded.z*total_shape_padded.y*total_shape_padded.x,
             global_size = global_shape.z*global_shape.y*global_shape.x,
             global_size_padded = global_shape_padded.z*global_shape_padded.y*global_shape_padded.x,
             radius = kernel_size / 2,
             blocks = (total_shape.z + global_shape.z - 1) / global_shape.z;
 
-        float *buf0 = new float[total_size];
-        float *buf1 = new float[total_size];
+        float *buf0 = new float[total_size_padded];
+        float *buf1 = new float[total_size_padded];
         float *buf0_stage = new float[global_size_padded];
         float *buf1_stage = new float[global_size_padded];
         uint8_t *mask = new uint8_t[global_size_padded];
@@ -741,74 +745,82 @@ namespace gpu {
         for (int64_t start_z = 0; start_z < total_shape.z; start_z += global_shape.z) {
             const int64_t
                 end_z = std::min(total_shape.z, start_z + global_shape.z),
-                this_block_size = (end_z - start_z) * total_shape.y * total_shape.x;
+                this_z = end_z - start_z;
+            const shape_t
+                this_shape = {this_z, global_shape.y, global_shape.x},
+                this_shape_padded = {this_z, global_shape_padded.y, global_shape_padded.x};
+            const int64_t
+                this_block_size = this_shape.z * this_shape.y * this_shape.x,
+                this_block_size_padded = this_shape_padded.z * this_shape_padded.y * this_shape_padded.x;
             const uint8_t *this_voxels = voxels + (start_z * total_shape.y * total_shape.x);
-            float *this_buf0 = buf0 + (start_z * total_shape.y * total_shape.x);
-            #pragma acc data copyin(this_voxels[:this_block_size]) create(this_buf0[:this_block_size]) copyout(this_buf0[:this_block_size])
+            float *this_buf0 = buf0 + (start_z * total_shape_padded.y * total_shape_padded.x);
+            #pragma acc data copyin(this_voxels[:this_block_size]) create(this_buf0[:this_block_size_padded]) copyout(this_buf0[:this_block_size_padded])
             {
-                convert_uint8_to_float(this_voxels, this_buf0, this_block_size);
+                // N P
+                convert_uint8_to_float(this_voxels, this_buf0, this_shape, this_shape_padded);
             }
         }
         const std::string debug_newline = DEBUG ? "\n" : "";
 
-        #pragma acc data copyin(kernel[:kernel_size])
+        #pragma acc data copyin(kernel[:kernel_size]) // TODO pad kernel?
         {
         for (int64_t rep = 0; rep < repititions; rep++) {
             for (int64_t block = 0; block < blocks; block++) {
                 std::cout << "\rDiffusion: " << rep*blocks + block << "/" << repititions*blocks << debug_newline << std::flush;
 
                 const int64_t
-                    disk_start_z = block * global_shape.z,
-                    disk_end_z = std::min(total_shape.z, disk_start_z + global_shape.z),
-                    padding_front = std::min((int64_t) radius, disk_start_z),
-                    padding_back = std::min((int64_t) radius, total_shape.z - disk_end_z),
-                    disk_start_z_padded = disk_start_z - padding_front,
-                    disk_end_z_padded = disk_end_z + padding_back,
-                    this_z = disk_end_z - disk_start_z,
-                    this_z_padded = disk_end_z_padded - disk_start_z_padded,
+                    global_start_z = block * global_shape.z,
+                    global_end_z = std::min(total_shape.z, global_start_z + global_shape.z),
+                    padding_front = std::min((int64_t) radius, global_start_z),
+                    padding_back = std::min((int64_t) radius, total_shape.z - global_end_z),
+                    global_start_z_padded = global_start_z - padding_front,
+                    global_end_z_padded = global_end_z + padding_back,
+                    this_z = global_end_z - global_start_z,
+                    this_z_padded = global_end_z_padded - global_start_z_padded,
                     leading_zeros_z = radius - padding_front,
-                    trailing_zeros_z = disk_end_z_padded == total_shape.z ? global_shape_padded.z - this_z_padded : radius - padding_back,
-                    leading_zeros = leading_zeros_z * total_shape.y * total_shape.x,
-                    trailing_zeros = trailing_zeros_z * total_shape.y * total_shape.x,
-                    this_block_size = this_z * total_shape.y * total_shape.x,
-                    this_block_size_padded = this_z_padded * total_shape.y * total_shape.x;
+                    trailing_zeros_z = global_end_z_padded == total_shape.z ? global_shape_padded.z - this_z_padded : radius - padding_back,
+                    leading_zeros = leading_zeros_z * total_shape_padded.y * total_shape_padded.x,
+                    trailing_zeros = trailing_zeros_z * total_shape_padded.y * total_shape_padded.x,
+                    this_block_size = this_z * total_shape_padded.y * total_shape_padded.x,
+                    this_block_size_padded = this_z_padded * total_shape_padded.y * total_shape_padded.x;
 
                 if (DEBUG) {
                     std::cout << global_shape_padded.z << " " << global_shape.z << " " << total_shape.z << std::endl;
-                    std::cout << disk_start_z_padded << " " << disk_start_z << " " << disk_end_z << " " << disk_end_z_padded << std::endl;
+                    std::cout << global_start_z_padded << " " << global_start_z << " " << global_end_z << " " << global_end_z_padded << std::endl;
                     std::cout << leading_zeros_z << " " << padding_front << " " << this_z << " " << padding_back << " " << trailing_zeros_z << std::endl;
                     std::cout << leading_zeros << " " << this_block_size_padded << " " << trailing_zeros << std::endl;
                     std::cout << this_block_size << std::endl;
                     std::cout << "-----" << std::endl;
                 }
 
-                assert (this_block_size <= global_size && "Block size is too large");
+                assert (this_block_size <= global_size_padded && "Block size is too large");
                 assert (leading_zeros_z + padding_front + this_z + padding_back + trailing_zeros_z == global_shape_padded.z && "Block size is incorrect");
                 assert (leading_zeros + this_block_size_padded + trailing_zeros == global_size_padded && "Block size is incorrect");
 
                 // Set the leading 0s of the stage
                 memset(buf0_stage, 0, leading_zeros * sizeof(float));
-                memcpy(buf0_stage + leading_zeros, buf0 + (disk_start_z_padded * total_shape.y * total_shape.x), this_block_size_padded * sizeof(float));
+                memcpy(buf0_stage + leading_zeros, buf0 + (global_start_z_padded * total_shape_padded.y * total_shape_padded.x), this_block_size_padded * sizeof(float));
                 memset(buf0_stage + leading_zeros + this_block_size_padded, 0, trailing_zeros * sizeof(float));
 
+                // TODO yuck
                 memset(mask, 0, leading_zeros * sizeof(uint8_t));
-                memcpy(mask + leading_zeros, voxels + (disk_start_z_padded * total_shape.y * total_shape.x), this_block_size_padded * sizeof(uint8_t));
-                memset(mask + leading_zeros + this_block_size_padded, 0, trailing_zeros * sizeof(uint8_t));
+                memcpy(mask + leading_zeros, voxels + (global_start_z_padded * total_shape.y * total_shape.x), (this_z_padded * total_shape.y * total_shape.x) * sizeof(uint8_t));
+                memset(mask + leading_zeros + (this_z_padded * total_shape.y * total_shape.x), 0, trailing_zeros * sizeof(uint8_t));
 
-                //const uint8_t *this_voxels = voxels + (disk_start_z_padded * total_shape.y * total_shape.x);
-
-                //#pragma acc data copyin(this_voxels[:global_size_padded], buf0_stage[:global_size_padded]) create(buf1_stage[:global_size_padded]) copyout(buf1_stage[:global_size_padded])
-                #pragma acc data copy(mask[:global_size_padded], buf0_stage[:global_size_padded], buf1_stage[:global_size_padded], buf1_stage[:global_size_padded])
+                #pragma acc data copy(mask[:global_shape_padded.z*global_shape.y*global_shape.x], buf0_stage[:global_size_padded], buf1_stage[:global_size_padded])
                 {
-                    for (int64_t dim = 0; dim < 3; dim++) {
-                        diffusion_core(buf0_stage, kernel, buf1_stage, global_shape_padded, dim, radius);
-                        std::swap(buf0_stage, buf1_stage);
-                    }
-                    illuminate(mask, buf0_stage, global_size_padded);
+                    //diffusion_core(buf0_stage, kernel, buf1_stage, global_shape_padded, 0, radius);
+                    //diffusion_core(buf1_stage, kernel, buf0_stage, global_shape_padded, 1, radius);
+                    //diffusion_core(buf0_stage, kernel, buf1_stage, global_shape_padded, 2, radius);
+                    diffusion_core_z(buf0_stage, kernel, buf1_stage, global_shape_padded, radius);
+                    diffusion_core_y(buf1_stage, kernel, buf0_stage, global_shape_padded, radius);
+                    diffusion_core_x(buf0_stage, kernel, buf1_stage, global_shape_padded, radius);
+                    std::swap(buf0_stage, buf1_stage);
+                    illuminate(mask, buf0_stage, {global_shape_padded.z, global_shape.y, global_shape.x}, global_shape_padded);
                 }
 
                 // Copy the result back
-                memcpy(buf1 + (disk_start_z * total_shape.y * total_shape.x), buf0_stage + leading_zeros + (padding_front*total_shape.y*total_shape.x), this_block_size * sizeof(float));
+                memcpy(buf1 + (global_start_z * total_shape_padded.y * total_shape_padded.x), buf0_stage + leading_zeros + (padding_front*total_shape_padded.y*total_shape_padded.x), this_block_size * sizeof(float));
             }
             std::swap(buf0, buf1);
         }
@@ -816,14 +828,19 @@ namespace gpu {
         std::cout << "\rDiffusion is complete!" << std::endl;
 
         for (int64_t start_z = 0; start_z < total_shape.z; start_z += global_shape.z) {
+            const int64_t end_z = std::min(total_shape.z, start_z + global_shape.z);
+            const shape_t
+                this_shape = {end_z - start_z, global_shape.y, global_shape.x},
+                this_shape_padded = {end_z - start_z, global_shape_padded.y, global_shape_padded.x};
             const int64_t
-                end_z = std::min(total_shape.z, start_z + global_shape.z),
-                this_block_size = (end_z - start_z) * total_shape.y * total_shape.x;
+                this_block_size = this_shape.z * this_shape.y * this_shape.x,
+                this_block_size_padded = this_shape_padded.z * this_shape_padded.y * this_shape_padded.x;
+            const float *this_buf0 = buf0 + (start_z * total_shape_padded.y * total_shape_padded.x);
             uint16_t *this_output = output + (start_z * total_shape.y * total_shape.x);
-            float *this_buf0 = buf0 + (start_z * total_shape.y * total_shape.x);
-            #pragma acc data copyin(this_buf0[:this_block_size]) create(this_output[:this_block_size]) copyout(this_output[:this_block_size])
+
+            #pragma acc data copyin(this_buf0[:this_block_size_padded]) create(this_output[:this_block_size]) copyout(this_output[:this_block_size])
             {
-                convert_float_to_uint16(this_buf0, this_output, this_block_size);
+                convert_float_to_uint16(this_buf0, this_output, this_shape, this_shape_padded);
             }
         }