Skip to content

Commit

Permalink
#5 Made diffusion_core_y faster by increasing number of blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
carljohnsen committed Jul 29, 2024
1 parent b76c123 commit 2a586b2
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/lib/cpp/gpu/diffusion.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,17 @@ namespace gpu {
// Assumes that the x dimension is a multiple of veclen.
constexpr int32_t
worklen = 1,
-veclen = 256,
+veclen = 64,
max_k = 32,
sqvec = max_k*veclen;
const int32_t
kernel_size = 2*radius+1,
nz = N.z, ny = N.y, nx = N.x;
#pragma acc parallel vector_length(veclen) num_workers(worklen) present(input, kernel, output)
{
-#pragma acc loop gang
+#pragma acc loop gang collapse(2)
for (int32_t z = 0; z < nz; z++) {
-#pragma acc loop worker
+//#pragma acc loop worker
for (int32_t x = 0; x < nx; x += veclen) {
for (int32_t x = 0; x < nx; x += veclen) {
float local[sqvec], local_kernel[max_k]; // Local memory.
#pragma acc cache(local_kernel, local)
Expand Down

0 comments on commit 2a586b2

Please sign in to comment.