#5 Doing pragma acc parallel loop on large loops creates memory violations. Instead, parallelize over chunks of num_threads
jamesavery · Apr 10, 2024 · 101f049 · 101f049
1 parent 230359a
commit 101f049
Showing 1 changed file with 9 additions and 7 deletions.
diff --git a/src/lib/cpp/gpu/diffusion.cc b/src/lib/cpp/gpu/diffusion.cc
@@ -222,10 +222,10 @@ namespace gpu {
     }
 
     void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const int64_t local_flat_size) {
-        #pragma acc parallel loop
-        for (int64_t i = 0; i < local_flat_size; i++) {
-            if (mask[i] > 0) {
-                output[i] = 1.0f;
+        #pragma acc parallel loop present(mask, output)
+        for (int64_t thread = 0; thread < gpu_threads; thread++) {
+            for (int64_t i = thread; i < local_flat_size; i += gpu_threads) {
+                output[i] = mask[i] ? 1.0f : output[i];
             }
         }
     }
@@ -239,9 +239,11 @@ namespace gpu {
     }
 
     void store_mask(const float *__restrict__ input, uint8_t *__restrict__ mask, const int64_t local_flat_size) {
-        #pragma acc parallel loop
-        for (int64_t i = 0; i < local_flat_size; i++) {
-            mask[i] = input[i] == 1.0f ? 1 : 0; // The implant will always be 1.0f
+        #pragma acc parallel loop present(input, mask)
+        for (int64_t thread = 0; thread < gpu_threads; thread++) {
+            for (int64_t i = thread; i < local_flat_size; i += gpu_threads) {
+                mask[i] = (input[i] == 1.0f); // The implant will always be 1.0f
+            }
         }
     }