From 9d27dcfb0ffbcf92b324230c92fc83e4270c2977 Mon Sep 17 00:00:00 2001
From: Carl Johnsen
Date: Tue, 10 Sep 2024 15:25:24 +0200
Subject: [PATCH] #5 Async/multi-device WIP: translating the out-of-core implementation into one that decomposes all axes, rather than just z.

---
 src/lib/cpp/gpu/diffusion.cc   | 336 +++++++++++++++++----------------
 src/pybind/diffusion-pybind.cc |   5 +-
 2 files changed, 174 insertions(+), 167 deletions(-)

diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc
index 880f999..5d954c1 100644
--- a/src/lib/cpp/gpu/diffusion.cc
+++ b/src/lib/cpp/gpu/diffusion.cc
@@ -4,6 +4,7 @@
 #include
 #include
 #include "openacc.h"
+#include "omp.h"
 
 constexpr bool
     DEBUG = false,
@@ -11,7 +12,7 @@ constexpr bool
 
 namespace gpu {
 
-    void diffusion_core(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t dim, const int32_t radius) {
+    void diffusion_core(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t dim, const int32_t radius, const int stream = 0) {
         // Note: the use of 32-bit is intentional, to reduce register pressure on the GPU. Each individual value stays below 2^32, but the flattened indices (addresses) into the arrays can exceed it, so those are computed in 64-bit.
         const int32_t
             stride = strides[dim],
             Ns[3] = {N.z, N.y, N.x};
 
-        #pragma acc parallel loop collapse(3) present(input, kernel, output)
+        #pragma acc parallel loop collapse(3) present(input, kernel, output) async(stream)
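+        // `stream` selects an OpenACC async queue: work on the same queue runs
+        // in order, work on different queues may overlap, and acc_wait(stream)
+        // synchronizes a single queue. The default of 0 simply funnels
+        // everything through one queue.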
         for (int32_t i = 0; i < N.z; i++) {
             for (int32_t j = 0; j < N.y; j++) {
                 for (int32_t k = 0; k < N.x; k++) {
@@ -118,7 +119,7 @@ namespace gpu {
     }
 
     // TODO Make this call diffusion_core_y, since they're VERY similar?
-    void diffusion_core_z(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) {
+    void diffusion_core_z(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius, const int stream = 0) {
         // Assumes that the x dimension is a multiple of veclen.
         constexpr int32_t
             worklen = 1,
@@ -128,7 +129,7 @@ namespace gpu {
         const int32_t
             kernel_size = 2*radius+1,
             nz = N.z, ny = N.y, nx = N.x;
 
-        #pragma acc parallel vector_length(veclen) num_workers(worklen) present(input, kernel, output)
+        #pragma acc parallel vector_length(veclen) num_workers(worklen) present(input, kernel, output) async(stream)
         {
             #pragma acc loop gang collapse(2)
             for (int32_t y = 0; y < ny; y++) {
@@ -171,7 +172,7 @@ namespace gpu {
         }
     }
 
-    void diffusion_core_y(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) {
+    void diffusion_core_y(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius, const int stream = 0) {
         // Assumes that the x dimension is a multiple of veclen.
         constexpr int32_t
             worklen = 1,
@@ -181,7 +182,7 @@ namespace gpu {
         const int32_t
             kernel_size = 2*radius+1,
             nz = N.z, ny = N.y, nx = N.x;
 
-        #pragma acc parallel vector_length(veclen) num_workers(worklen) present(input, kernel, output)
+        #pragma acc parallel vector_length(veclen) num_workers(worklen) present(input, kernel, output) async(stream)
         {
             #pragma acc loop gang collapse(2)
             for (int32_t z = 0; z < nz; z++) {
@@ -224,12 +225,12 @@ namespace gpu {
         }
     }
 
-    void diffusion_core_x(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius) {
+    void diffusion_core_x(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int32_t radius, const int stream) {
         // Note: the use of 32-bit is intentional, to reduce register pressure on the GPU. Each individual value stays below 2^32, but the flattened indices (addresses) into the arrays can exceed it, so those are computed in 64-bit.
         constexpr int32_t veclen = 32;
         float local[3*veclen], local_kernel[veclen]; // Local memory.
         const int32_t nz = N.z, ny = N.y, nx = N.x;
-        #pragma acc parallel vector_length(veclen) num_workers(1) present(input, kernel, output)
+        #pragma acc parallel vector_length(veclen) num_workers(1) present(input, kernel, output) async(stream)
         {
             #pragma acc loop gang collapse(2)
             for (int32_t z = 0; z < nz; z++) {
@@ -510,8 +511,8 @@ namespace gpu {
         fclose(file_src);
     }
 
-    void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const int64_t local_flat_size) {
-        #pragma acc parallel loop present(mask, output)
+    void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const int64_t local_flat_size, const int stream = 0) {
+        #pragma acc parallel loop present(mask, output) async(stream)
         for (int64_t thread = 0; thread < gpu_threads; thread++) {
             const int64_t
                 chunk_size = local_flat_size / gpu_threads,
@@ -523,8 +524,8 @@ namespace gpu {
         }
     }
 
-    void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const shape_t &N, const shape_t &P) {
-        #pragma acc parallel loop collapse(3) present(mask, output)
+    void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const shape_t &N, const shape_t &P, const int stream = 0) {
+        #pragma acc parallel loop collapse(3) present(mask, output) async(stream)
         for (int32_t z = 0; z < N.z; z++) {
             for (int32_t y = 0; y < N.y; y++) {
                 for (int32_t x = 0; x < N.x; x++) {
@@ -537,19 +538,19 @@ namespace gpu {
         }
     }
 
-    void diffusion_step(const uint8_t *__restrict__ voxels, float *buf0, float *buf1, const shape_t &N, const shape_t &P, const float *__restrict__ kernel, const int64_t radius) {
+    void diffusion_step(const uint8_t *__restrict__ voxels, float *buf0, float *buf1, const shape_t &N, const shape_t &P, const float *__restrict__ kernel, const int64_t radius, const int stream = 0) {
         if (radius < 16) {
-            diffusion_core_z(buf0, kernel, buf1, P, radius);
-            diffusion_core_y(buf1, kernel, buf0, P, radius);
-            diffusion_core_x(buf0, kernel, buf1, P, radius);
+            diffusion_core_z(buf0, kernel, buf1, P, radius, stream);
+            diffusion_core_y(buf1, kernel, buf0, P, radius, stream);
+            diffusion_core_x(buf0, kernel, buf1, P, radius, stream);
         } else {
-            diffusion_core(buf0, kernel, buf1, P, 0, radius);
-            diffusion_core(buf1, kernel, buf0, P, 1, radius);
-            diffusion_core(buf0, kernel, buf1, P, 2, radius);
+            diffusion_core(buf0, kernel, buf1, P, 0, radius, stream);
+            diffusion_core(buf1, kernel, buf0, P, 1, radius, stream);
+            diffusion_core(buf0, kernel, buf1, P, 2, radius, stream);
         }
         std::swap(buf0, buf1);
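+        // Note: buf0 and buf1 are passed by value, so this swap only renames
+        // the local pointers; after the three separable passes the result is in
+        // the caller's buf1, and the illuminate() below re-lights the mask
+        // voxels in that freshly diffused buffer.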
-        illuminate(voxels, buf0, N, P);
+        illuminate(voxels, buf0, N, P, stream);
     }
 
     void store_mask(const float *__restrict__ input, uint8_t *__restrict__ mask, const int64_t local_flat_size) {
@@ -729,169 +730,174 @@ namespace gpu {
         fclose(tmp1);
     }
 
+    // The full data sample resides in main memory, but does not fit in GPU memory.
+    // Assumes that each dimension of global_shape is > kernel_size, so that a block can contain the overlap. This assumption allows for controlling the memory used by each thread / async operation outside of this call.
     void diffusion_out_of_core(uint8_t *__restrict__ voxels, const shape_t &total_shape, const shape_t &global_shape, const float *__restrict__ kernel, const int64_t kernel_size, const int64_t repititions, uint16_t *__restrict__ output) {
-        constexpr int32_t veclen = 32; // TODO
+        // TODO should be a configuration parameter set somewhere global / statically during configuration/compilation.
+        constexpr int32_t
+            veclen = 32,
+            #pragma diag_suppress 177
+            n_devices = 1,
+            n_streams = 1;
+
+        const int64_t radius = kernel_size / 2;
         const shape_t
-            total_shape_padded = {total_shape.z, total_shape.y, (total_shape.x + veclen - 1) / veclen * veclen},
-            global_shape_padded = {global_shape.z+kernel_size-1, global_shape.y, (global_shape.x + veclen - 1) / veclen * veclen};
+            //total_shape_padded = {
+            //    ((total_shape.z + veclen - 1) / veclen) * veclen,
+            //    ((total_shape.y + veclen - 1) / veclen) * veclen,
+            //    ((total_shape.x + veclen - 1) / veclen) * veclen
+            //},
+            // TODO assume that global_shape is also aligned to veclen?
+            // Each block is grown by the kernel radius on every side, then rounded up to the vector length.
+            global_shape_padded = {
+                (((global_shape.z + 2*radius) + veclen - 1) / veclen) * veclen,
+                (((global_shape.y + 2*radius) + veclen - 1) / veclen) * veclen,
+                (((global_shape.x + 2*radius) + veclen - 1) / veclen) * veclen
+            },
+            // Number of blocks along each axis, rounded up to cover the remainder.
+            blocks_shape = {
+                (total_shape.z + global_shape.z - 1) / global_shape.z,
+                (total_shape.y + global_shape.y - 1) / global_shape.y,
+                (total_shape.x + global_shape.x - 1) / global_shape.x
+            };
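+        // Worked example (hypothetical numbers): with global_shape = {64,64,64},
+        // kernel_size = 5 (radius = 2) and veclen = 32, every axis of a padded
+        // block becomes ceil((64 + 4) / 32) * 32 = 96, and a 300^3 total volume
+        // decomposes into ceil(300/64)^3 = 5*5*5 = 125 blocks.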
         const int64_t
-            //total_size = total_shape.z*total_shape.y*total_shape.x,
-            total_size_padded = total_shape_padded.z*total_shape_padded.y*total_shape_padded.x,
-            global_size = global_shape.z*global_shape.y*global_shape.x,
-            global_size_padded = global_shape_padded.z*global_shape_padded.y*global_shape_padded.x,
-            radius = kernel_size / 2,
-            blocks = (total_shape.z + global_shape.z - 1) / global_shape.z;
+            total_size = total_shape.z*total_shape.y*total_shape.x,
+            //total_size_padded = total_shape_padded.z*total_shape_padded.y*total_shape_padded.x,
+            //global_size = global_shape.z*global_shape.y*global_shape.x,
+            //blocks = (total_shape.z + global_shape.z - 1) / global_shape.z,
+            global_size_padded = global_shape_padded.z*global_shape_padded.y*global_shape_padded.x;
 
-        float *buf0 = new float[total_size_padded];
-        float *buf1 = new float[total_size_padded];
-        float *buf0_stage = new float[global_size_padded];
-        float *buf1_stage = new float[global_size_padded];
-        uint8_t *mask = new uint8_t[global_size_padded];
-
-        const int32_t n_streams = 3; // read, compute, write
-        uint8_t *voxels_ds[n_streams];
-        float *buf0_ds[n_streams];
-        for (int32_t i = 0; i < n_streams; i++) {
-            voxels_ds[i] = (uint8_t *) acc_malloc(global_size * sizeof(uint8_t));
-            buf0_ds[i] = (float *) acc_malloc(global_size_padded * sizeof(float));
-        }
+        float *buf0 = (float *) malloc(total_size * sizeof(float));
+        float *buf1 = (float *) malloc(total_size * sizeof(float));
+
+        // This conversion should be faster on the CPU than on the GPU, since the data has to come back to the CPU anyway due to its size. For the in-memory version, the GPU is faster, since the transfer back can be avoided.
+        // This should be benchmarked to confirm.
+        #pragma omp parallel for collapse(3) schedule(static)
+        for (int32_t z = 0; z < total_shape.z; z++) {
+            for (int32_t y = 0; y < total_shape.y; y++) {
+                for (int32_t x = 0; x < total_shape.x; x++) {
+                    // The loop bounds match total_shape exactly, and source and destination share the same flat index, so no validity checks are needed here.
+                    const int64_t index = (int64_t)z*(int64_t)total_shape.y*(int64_t)total_shape.x + (int64_t)y*(int64_t)total_shape.x + (int64_t)x;
+                    buf0[index] = voxels[index] ? 1.0f : 0.0f;
+                }
+            }
+        }
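+        // At this point buf0 holds the working field in host memory: 1.0f for
+        // voxels inside the mask, 0.0f elsewhere.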
 
-        for (int64_t start_z = 0; start_z < total_shape.z; start_z += global_shape.z) {
-            const int64_t
-                end_z = std::min(total_shape.z, start_z + global_shape.z),
-                this_z = end_z - start_z;
-            const shape_t
-                this_shape = {this_z, global_shape.y, global_shape.x},
-                this_shape_padded = {this_z, global_shape_padded.y, global_shape_padded.x};
-            const int64_t
-                this_block_size = this_shape.z * this_shape.y * this_shape.x,
-                this_block_size_padded = this_shape_padded.z * this_shape_padded.y * this_shape_padded.x;
-            uint8_t *this_voxels = voxels + (start_z * total_shape.y * total_shape.x);
-            float *this_buf0 = buf0 + (start_z * total_shape_padded.y * total_shape_padded.x);
-            const int32_t stream = ((start_z / global_shape.z) % n_streams);
-            uint8_t *voxels_d = voxels_ds[stream];
-            float *buf0_d = buf0_ds[stream];
-            #pragma acc declare deviceptr(voxels_d, buf0_d)
-            {
-                acc_memcpy_to_device_async(voxels_d, this_voxels, this_block_size * sizeof(uint8_t), stream);
-                // TODO for some reason, it fails when called as a function. Pasted function content works.
-                //convert_uint8_to_float(voxels_d, buf0_d, this_shape, this_shape_padded, stream);
-
-                #pragma acc parallel vector_length(128) num_workers(1) present(voxels_d, buf0_d) async(stream)
-                {
-                    #pragma acc loop gang collapse(2)
-                    for (int32_t z = 0; z < this_shape_padded.z; z++) {
-                        for (int32_t y = 0; y < this_shape_padded.y; y++) {
-                            #pragma acc loop vector
-                            for (int32_t x = 0; x < this_shape_padded.x; x++) {
-                                const bool
-                                    valid_z = z < this_shape.z,
-                                    valid_y = y < this_shape.y,
-                                    valid_x = x < this_shape.x;
-                                const int64_t
-                                    src_index = (int64_t)z*(int64_t)this_shape.y*(int64_t)this_shape.x + (int64_t)y*(int64_t)this_shape.x + (int64_t)x,
-                                    dst_index = (int64_t)z*(int64_t)this_shape_padded.y*(int64_t)this_shape_padded.x + (int64_t)y*(int64_t)this_shape_padded.x + (int64_t)x;
-                                buf0_d[dst_index] = valid_z && valid_y && valid_x ? (float) voxels_d[src_index] : 0.0f;
+        #pragma omp parallel num_threads(n_devices * n_streams)
+        {
+            int32_t
+                tid = omp_get_thread_num(),
+                __attribute__((unused)) device = tid / n_streams,
+                stream = tid % n_streams;
+            acc_set_device_num(device, acc_device_nvidia);
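+            // Hypothetical example: with n_devices = 2 and n_streams = 4, the
+            // parallel region runs 8 host threads; thread 5 gets device = 1 and
+            // stream = 1, so every device is fed by n_streams independent async
+            // queues, and transfers issued by different threads can overlap.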
 
            uint8_t
                // Staging area on host
                *voxels_h = (uint8_t *) malloc(global_size_padded * sizeof(uint8_t)),
                // Device memory
                *voxels_d = (uint8_t *) acc_malloc(global_size_padded * sizeof(uint8_t));
            float
                // Staging area on host
                *buf0_h = (float *) malloc(global_size_padded * sizeof(float)),
                *buf1_h = (float *) malloc(global_size_padded * sizeof(float)),
                // Device memory
                *buf0_d = (float *) acc_malloc(global_size_padded * sizeof(float)),
                *buf1_d = (float *) acc_malloc(global_size_padded * sizeof(float));

            // acc_map_data expects the host address first, then the device address it should map to.
            acc_map_data(voxels_h, voxels_d, global_size_padded * sizeof(uint8_t));
            acc_map_data(buf0_h, buf0_d, global_size_padded * sizeof(float));
            acc_map_data(buf1_h, buf1_d, global_size_padded * sizeof(float));

            // The core functions use present(kernel), so the kernel coefficients must be resident on this thread's device.
            #pragma acc enter data copyin(kernel[:kernel_size])

            for (int64_t rep = 0; rep < repititions; rep++) {
                #pragma omp for collapse(3) schedule(dynamic)
                for (int32_t bz = 0; bz < blocks_shape.z; bz++) {
                    for (int32_t by = 0; by < blocks_shape.y; by++) {
                        for (int32_t bx = 0; bx < blocks_shape.x; bx++) {
                            const int64_t
                                start_z = bz * global_shape.z,
                                start_y = by * global_shape.y,
                                start_x = bx * global_shape.x,
                                end_z = std::min(total_shape.z, start_z + global_shape.z),
                                end_y = std::min(total_shape.y, start_y + global_shape.y),
                                end_x = std::min(total_shape.x, start_x + global_shape.x);

                            // Copy the data to the staging area, including the radius-wide halo; out-of-range voxels are zero-padded.
                            for (int32_t z = 0; z < global_shape_padded.z; z++) {
                                for (int32_t y = 0; y < global_shape_padded.y; y++) {
                                    for (int32_t x = 0; x < global_shape_padded.x; x++) {
                                        const int64_t
                                            offsetted_z = start_z + z - radius,
                                            offsetted_y = start_y + y - radius,
                                            offsetted_x = start_x + x - radius,
                                            src_index = (int64_t)offsetted_z*(int64_t)total_shape.y*(int64_t)total_shape.x + (int64_t)offsetted_y*(int64_t)total_shape.x + (int64_t)offsetted_x,
                                            dst_index = (int64_t)z*(int64_t)global_shape_padded.y*(int64_t)global_shape_padded.x + (int64_t)y*(int64_t)global_shape_padded.x + (int64_t)x;
                                        const bool
                                            valid_z = offsetted_z >= 0 && offsetted_z < total_shape.z,
                                            valid_y = offsetted_y >= 0 && offsetted_y < total_shape.y,
                                            valid_x = offsetted_x >= 0 && offsetted_x < total_shape.x;
                                        voxels_h[dst_index] = valid_z && valid_y && valid_x ? voxels[src_index] : 0;
                                        buf0_h[dst_index] = valid_z && valid_y && valid_x ? buf0[src_index] : 0;
                                    }
                                }
                            }

                            #pragma acc declare deviceptr(voxels_d, buf0_d, buf1_d)
                            {
                                acc_memcpy_to_device_async(voxels_d, voxels_h, global_size_padded * sizeof(uint8_t), stream);
                                acc_memcpy_to_device_async(buf0_d, buf0_h, global_size_padded * sizeof(float), stream);
                                acc_wait(stream);
                                diffusion_step(voxels_d, buf0_d, buf1_d, global_shape_padded, global_shape_padded, kernel, radius, stream);
                                acc_memcpy_from_device_async(buf1_h, buf1_d, global_size_padded * sizeof(float), stream);
                                acc_wait(stream);
                            }
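+                            // Within one thread this queue runs upload ->
+                            // diffusion_step -> download, and the two acc_wait
+                            // calls make each block synchronous; overlap between
+                            // blocks comes from the other OpenMP threads driving
+                            // their own queues (and, eventually, devices).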
+                            // Copy the data back, clamped to the actual block
+                            // extent (end_* - start_*) so that edge blocks do not
+                            // write outside of buf1.
+                            for (int32_t z = 0; z < end_z - start_z; z++) {
+                                for (int32_t y = 0; y < end_y - start_y; y++) {
+                                    for (int32_t x = 0; x < end_x - start_x; x++) {
+                                        const int64_t
+                                            src_index = ((int64_t)z+radius)*(int64_t)global_shape_padded.y*(int64_t)global_shape_padded.x + ((int64_t)y+radius)*(int64_t)global_shape_padded.x + (int64_t)x+radius,
+                                            dst_index = ((int64_t)start_z+(int64_t)z)*(int64_t)total_shape.y*(int64_t)total_shape.x + ((int64_t)start_y+(int64_t)y)*(int64_t)total_shape.x + (int64_t)start_x+(int64_t)x;
+                                        buf1[dst_index] = buf1_h[src_index];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // buf0 and buf1 are shared by the whole team, so exactly one
+                // thread swaps them; the implicit barrier of omp single keeps the
+                // other threads from starting the next repetition early.
+                #pragma omp single
+                std::swap(buf0, buf1);
+            }
+
+            // Release this thread's copy of the kernel coefficients again.
+            #pragma acc exit data delete(kernel)
+
+            // acc_unmap_data expects the host address that was mapped.
+            acc_unmap_data(voxels_h);
+            acc_unmap_data(buf0_h);
+            acc_unmap_data(buf1_h);
+
+            free(voxels_h);
+            free(buf0_h);
+            free(buf1_h);
+
+            acc_free(voxels_d);
+            acc_free(buf0_d);
+            acc_free(buf1_d);
+        }
-                            }
-                        }
-                    }
-                }
-
-                acc_memcpy_from_device_async(this_buf0, buf0_d, this_block_size_padded * sizeof(float), stream);
-            }
-        }
-        #pragma acc wait
-        const std::string debug_newline = DEBUG ? "\n" : "";
-        for (int32_t i = 0; i < n_streams; i++) {
-            acc_free(voxels_ds[i]);
-            acc_free(buf0_ds[i]);
-        }
-        #pragma acc data copyin(kernel[:kernel_size]) // TODO pad kernel?
-        {
-            for (int64_t rep = 0; rep < repititions; rep++) {
-                for (int64_t block = 0; block < blocks; block++) {
-                    std::cout << "\rDiffusion: " << rep*blocks + block << "/" << repititions*blocks << debug_newline << std::flush;
-
-                    const int64_t
-                        global_start_z = block * global_shape.z,
-                        global_end_z = std::min(total_shape.z, global_start_z + global_shape.z),
-                        padding_front = std::min((int64_t) radius, global_start_z),
-                        padding_back = std::min((int64_t) radius, total_shape.z - global_end_z),
-                        global_start_z_padded = global_start_z - padding_front,
-                        global_end_z_padded = global_end_z + padding_back,
-                        this_z = global_end_z - global_start_z,
-                        this_z_padded = global_end_z_padded - global_start_z_padded,
-                        leading_zeros_z = radius - padding_front,
-                        trailing_zeros_z = global_end_z_padded == total_shape.z ? global_shape_padded.z - this_z_padded : radius - padding_back,
-                        leading_zeros = leading_zeros_z * total_shape_padded.y * total_shape_padded.x,
-                        trailing_zeros = trailing_zeros_z * total_shape_padded.y * total_shape_padded.x,
-                        this_block_size = this_z * total_shape_padded.y * total_shape_padded.x,
-                        this_block_size_padded = this_z_padded * total_shape_padded.y * total_shape_padded.x;
-
-                    if (DEBUG) {
-                        std::cout << global_shape_padded.z << " " << global_shape.z << " " << total_shape.z << std::endl;
-                        std::cout << global_start_z_padded << " " << global_start_z << " " << global_end_z << " " << global_end_z_padded << std::endl;
-                        std::cout << leading_zeros_z << " " << padding_front << " " << this_z << " " << padding_back << " " << trailing_zeros_z << std::endl;
-                        std::cout << leading_zeros << " " << this_block_size_padded << " " << trailing_zeros << std::endl;
-                        std::cout << this_block_size << std::endl;
-                        std::cout << "-----" << std::endl;
-                    }
-
-                    assert (this_block_size <= global_size_padded && "Block size is too large");
-                    assert (leading_zeros_z + padding_front + this_z + padding_back + trailing_zeros_z == global_shape_padded.z && "Block size is incorrect");
-                    assert (leading_zeros + this_block_size_padded + trailing_zeros == global_size_padded && "Block size is incorrect");
-
-                    // Set the leading 0s of the stage
-                    memset(buf0_stage, 0, leading_zeros * sizeof(float));
-                    memcpy(buf0_stage + leading_zeros, buf0 + (global_start_z_padded * total_shape_padded.y * total_shape_padded.x), this_block_size_padded * sizeof(float));
-                    memset(buf0_stage + leading_zeros + this_block_size_padded, 0, trailing_zeros * sizeof(float));
-
-                    // TODO yuck
-                    memset(mask, 0, leading_zeros * sizeof(uint8_t));
-                    memcpy(mask + leading_zeros, voxels + (global_start_z_padded * total_shape.y * total_shape.x), (this_z_padded * total_shape.y * total_shape.x) * sizeof(uint8_t));
-                    memset(mask + leading_zeros + (this_z_padded * total_shape.y * total_shape.x), 0, trailing_zeros * sizeof(uint8_t));
-
-                    #pragma acc data copy(mask[:global_shape_padded.z*global_shape.y*global_shape.x], buf0_stage[:global_size_padded], buf1_stage[:global_size_padded])
-                    {
-                        //diffusion_core(buf0_stage, kernel, buf1_stage, global_shape_padded, 0, radius);
-                        //diffusion_core(buf1_stage, kernel, buf0_stage, global_shape_padded, 1, radius);
-                        //diffusion_core(buf0_stage, kernel, buf1_stage, global_shape_padded, 2, radius);
-                        diffusion_core_z(buf0_stage, kernel, buf1_stage, global_shape_padded, radius);
-                        diffusion_core_y(buf1_stage, kernel, buf0_stage, global_shape_padded, radius);
-                        diffusion_core_x(buf0_stage, kernel, buf1_stage, global_shape_padded, radius);
-                        illuminate(mask, buf1_stage, {global_shape_padded.z, global_shape.y, global_shape.x}, global_shape_padded);
-                    }
-
-                    // Copy the result back
-                    memcpy(buf1 + (global_start_z * total_shape_padded.y * total_shape_padded.x), buf1_stage + leading_zeros + (padding_front*total_shape_padded.y*total_shape_padded.x), this_block_size * sizeof(float));
-                }
-                std::swap(buf0, buf1);
-            }
-        }
-        std::cout << "\rDiffusion is complete!" << std::endl;
-
-        for (int64_t start_z = 0; start_z < total_shape.z; start_z += global_shape.z) {
-            const int64_t end_z = std::min(total_shape.z, start_z + global_shape.z);
-            const shape_t
-                this_shape = {end_z - start_z, global_shape.y, global_shape.x},
-                this_shape_padded = {end_z - start_z, global_shape_padded.y, global_shape_padded.x};
-            const int64_t
-                this_block_size = this_shape.z * this_shape.y * this_shape.x,
-                this_block_size_padded = this_shape_padded.z * this_shape_padded.y * this_shape_padded.x;
-            const float *this_buf0 = buf0 + (start_z * total_shape_padded.y * total_shape_padded.x);
-            uint16_t *this_output = output + (start_z * total_shape.y * total_shape.x);
-
-            #pragma acc data copyin(this_buf0[:this_block_size_padded]) create(this_output[:this_block_size]) copyout(this_output[:this_block_size])
-            {
-                convert_float_to_uint16(this_buf0, this_output, this_shape, this_shape_padded);
-            }
-        }
-        delete[] buf0;
-        delete[] buf1;
-        delete[] buf0_stage;
-        delete[] buf1_stage;
+
+        // Same argument as above - this conversion should be faster on the CPU.
+        #pragma omp parallel for collapse(3) schedule(static)
+        for (int32_t z = 0; z < total_shape.z; z++) {
+            for (int32_t y = 0; y < total_shape.y; y++) {
+                for (int32_t x = 0; x < total_shape.x; x++) {
+                    // Source and destination share the same flat index; scale the [0,1] float field to the full uint16 range.
+                    const int64_t index = (int64_t)z*(int64_t)total_shape.y*(int64_t)total_shape.x + (int64_t)y*(int64_t)total_shape.x + (int64_t)x;
+                    output[index] = (uint16_t) std::floor(buf0[index] * 65535.0f);
+                }
+            }
+        }
+
+        free(buf0);
+        free(buf1);
     }
 }
\ No newline at end of file
diff --git a/src/pybind/diffusion-pybind.cc b/src/pybind/diffusion-pybind.cc
index 5ce57bd..a15b8e4 100644
--- a/src/pybind/diffusion-pybind.cc
+++ b/src/pybind/diffusion-pybind.cc
@@ -17,8 +17,9 @@ namespace python_api {
 
 #ifdef _OPENACC
         const int64_t total_size = N.z * N.y * N.x;
-        if (total_size * sizeof(float) * 2 > 8 * 1e9) { // TODO make automatic
-            const shape_t global_shape = {32, N.y, N.x};
+        //if (total_size * (sizeof(uint8_t) + (2 * sizeof(float)) + sizeof(uint16_t)) > (1 * 1e9)) { // TODO make automatic
+        if (true) {
+            const shape_t global_shape = {64, 64, 64};
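+            // WIP/debug: the out-of-core path is forced on with small blocks for
+            // testing. The commented-out heuristic above (uint8 mask + two float
+            // buffers + uint16 output per voxel against a ~1 GB budget) is the
+            // intended automatic switch.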
             NS::diffusion_out_of_core(voxels, N, global_shape, kernel, kernel_size, repititions, output);
         } else {
 #endif