Skip to content

Commit

Permalink
#5 Doing pragma acc parallel loop on large loops creates memory violations.
Browse files Browse the repository at this point in the history

Instead, parallelize over chunks of num_threads
  • Loading branch information
carljohnsen committed Apr 10, 2024
1 parent 230359a commit 101f049
Showing 1 changed file with 9 additions and 7 deletions.
16 changes: 9 additions & 7 deletions src/lib/cpp/gpu/diffusion.cc
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,10 @@ namespace gpu {
}

void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const int64_t local_flat_size) {
#pragma acc parallel loop
for (int64_t i = 0; i < local_flat_size; i++) {
if (mask[i] > 0) {
output[i] = 1.0f;
#pragma acc parallel loop present(mask, output)
for (int64_t thread = 0; thread < gpu_threads; thread++) {
for (int64_t i = thread; i < local_flat_size; i += gpu_threads) {
output[i] = mask[i] ? 1.0f : output[i];
}
}
}
Expand All @@ -239,9 +239,11 @@ namespace gpu {
}

void store_mask(const float *__restrict__ input, uint8_t *__restrict__ mask, const int64_t local_flat_size) {
#pragma acc parallel loop
for (int64_t i = 0; i < local_flat_size; i++) {
mask[i] = input[i] == 1.0f ? 1 : 0; // The implant will always be 1.0f
#pragma acc parallel loop present(input, mask)
for (int64_t thread = 0; thread < gpu_threads; thread++) {
for (int64_t i = thread; i < local_flat_size; i += gpu_threads) {
mask[i] = (input[i] == 1.0f); // The implant will always be 1.0f
}
}
}

Expand Down

0 comments on commit 101f049

Please sign in to comment.