From be0993eaa8bd82e3bd44bafe21fdffde52d2b6a1 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:09:53 +0200 Subject: [PATCH 01/37] add halo width parameter --- src/init_global_grid.jl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index cc77591..102d3f4 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -12,7 +12,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de - {`periodx`|`periody`|`periodz`}`::Integer=0`: whether the grid is periodic (`1`) or not (`0`) in dimension {x|y|z}. - `quiet::Bool=false`: whether to suppress printing information like the size of the global grid (`true`) or not (`false`). !!! note "Advanced keyword arguments" - - {`overlapx`|`overlapy`|`overlapz`}`::Integer=2`: the number of elements adjacent local grids overlap in dimension {x|y|z}. By default (value `2`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlapx=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z. + - `overlaps::Tuple{Int,Int,Int}=(2,2,2)`: the number of elements adjacent local grids overlap in dimension x, y and z. By default (value `(2,2,2)`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlaps[1]=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z. + - `halowidths::Tuple{Int,Int,Int}=overlaps.÷2`: the default width of an array's halo in dimension x, y and z. The default can be overwritten per array in the function [`update_halo`](@ref). - `disp::Integer=1`: the displacement argument to `MPI.Cart_shift` in order to determine the neighbors. - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology. 
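A minimal usage sketch of the keyword arguments documented above (illustrative only, not part of the patch; grid size and overlap values are arbitrary):

```julia
using ImplicitGlobalGrid

nx, ny, nz = 64, 64, 64
# `overlaps` is now a single tuple (replacing overlapx/overlapy/overlapz); `halowidths`
# defaults to overlaps.÷2 and must not exceed overlaps.÷2 in any dimension.
me, dims = init_global_grid(nx, ny, nz; overlaps=(4, 2, 2), halowidths=(2, 1, 1))
A = zeros(nx, ny, nz)
update_halo!(A)              # exchanges halos of the widths set above
finalize_global_grid()
```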
@@ -37,12 +38,13 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de See also: [`finalize_global_grid`](@ref), [`select_device`](@ref) """ -function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlapx::Integer=2, overlapy::Integer=2, overlapz::Integer=2, disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) +function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=overlaps.÷2, disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) if grid_is_initialized() error("The global grid has already been initialized.") end nxyz = [nx, ny, nz]; dims = [dimx, dimy, dimz]; periods = [periodx, periody, periodz]; - overlaps = [overlapx, overlapy, overlapz]; + overlaps = [overlaps...]; + halowidths = [halowidths...]; cuda_enabled = false amdgpu_enabled = false cudaaware_MPI = [false, false, false] @@ -73,7 +75,8 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if (nx==1) error("Invalid arguments: nx can never be 1.") end if (ny==1 && nz>1) error("Invalid arguments: ny cannot be 1 if nz is greater than 1.") end if (any((nxyz .== 1) .& (dims .>1 ))) error("Incoherent arguments: if nx, ny, or nz is 1, then the corresponding dimx, dimy or dimz must not be set (or set 0 or 1)."); end - if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlapx-1, 2*overlapy-1 or 2*overlapz-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end + if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlaps[1]-1, 2*overlaps[2]-1 or 2*overlaps[3]-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end + if (any(halowidths .> overlaps.÷2)) error("Incoherent arguments: halowidths cannot be greater than overlaps in each dimension."); end dims[(nxyz.==1).&(dims.==0)] .= 1; # Setting any of nxyz to 1, means that the corresponding dimension must also be 1 in the global grid. Thus, the corresponding dims entry must be 1. if (init_MPI) # NOTE: init MPI only, once the input arguments have been checked. if (MPI.Initialized()) error("MPI is already initialized. Set the argument 'init_MPI=false'."); end @@ -91,7 +94,7 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp); end nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. 
for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2 - set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet)); + set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet)); if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]))"); end if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end init_timing_functions(); From 8eb82e2057b270142db9ca60a3de342a97a3d55a Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:10:34 +0200 Subject: [PATCH 02/37] add halo width parameter and GGField --- src/shared.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/shared.jl b/src/shared.jl index 8455714..23d8d58 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -41,6 +41,7 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU" const GGInt = Cint const GGNumber = Number const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGField{T,N} = NamedTuple{(:A, :halowidths), Tuple{GGArray{T,N}, Tuple{Int,Int,Int}}} "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid @@ -48,6 +49,7 @@ struct GlobalGrid nxyz::Vector{GGInt} dims::Vector{GGInt} overlaps::Vector{GGInt} + halowidths::Vector{GGInt} nprocs::GGInt me::GGInt coords::Vector{GGInt} @@ -63,7 +65,7 @@ struct GlobalGrid loopvectorization::Vector{Bool} quiet::Bool end -const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, false, false, [false,false,false], [false,false,false], [false,false,false], false) +const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, false, false, [false,false,false], [false,false,false], [false,false,false], false) # Macro to switch on/off check_initialized() for performance reasons (potentially relevant for tools.jl). macro check_initialized() :(check_initialized();) end #FIXME: Alternative: macro check_initialized() end From 7d373d46f3d0b9e1c797bfbf9e2b54a03d5f6bd6 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:12:24 +0200 Subject: [PATCH 03/37] adjust unit tests to new overlaps keyword --- test/test_gather.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_gather.jl b/test/test_gather.jl index 2a68b46..42cc4af 100644 --- a/test/test_gather.jl +++ b/test/test_gather.jl @@ -35,7 +35,7 @@ dz = 1.0 @testset "2. gather!" 
begin @testset "1D" begin - me, dims = init_global_grid(nx, 1, 1, overlapx=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, 1, 1, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx); P_g = zeros(nx*dims[1]); P .= [x_g(ix,dx,P) for ix=1:size(P,1)]; @@ -46,7 +46,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "2D" begin - me, dims = init_global_grid(nx, ny, 1, overlapx=0, overlapy=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, 1, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx, ny); P_g = zeros(nx*dims[1], ny*dims[2]); P .= [y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2)]; @@ -57,7 +57,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "3D" begin - me, dims = init_global_grid(nx, ny, nz, overlapx=0, overlapy=0, overlapz=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, nz, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx, ny, nz); P_g = zeros(nx*dims[1], ny*dims[2], nz*dims[3]); P .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; @@ -68,7 +68,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "1D, then larger 3D, then smaller 2D" begin - me, dims = init_global_grid(nx, ny, nz, overlapx=0, overlapy=0, overlapz=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, nz, overlaps=(0,0,0), quiet=true, init_MPI=false); # (1D) P = zeros(nx); P_g = zeros(nx*dims[1], dims[2], dims[3]); @@ -96,7 +96,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "Float32, then Float64, then Int16" begin - me, dims = init_global_grid(nx, ny, nz, overlapx=0, overlapy=0, overlapz=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, nz, overlaps=(0,0,0), quiet=true, init_MPI=false); # Float32 (1D) P = zeros(Float32, nx); P_g = zeros(Float32, nx*dims[1], dims[2], dims[3]); @@ -136,7 +136,7 @@ dz = 1.0 end; end @testset "nothing on non-root" begin - me, dims = init_global_grid(nx, 1, 1, overlapx=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, 1, 1, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx); P_g = (me == 0) ? zeros(nx*dims[1]) : nothing P .= [x_g(ix,dx,P) for ix=1:size(P,1)]; From 8f6209ae3c22735c4cd40313510de8dd0703aed9 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:12:40 +0200 Subject: [PATCH 04/37] adjust unit tests to new overlaps keyword --- test/test_init_global_grid.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl index f24343e..fe90f58 100644 --- a/test/test_init_global_grid.jl +++ b/test/test_init_global_grid.jl @@ -76,7 +76,7 @@ nz = 1; nz = 8; olz = 3; olx = 3; - init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlapx=olx, overlapz=olz, quiet=true, init_MPI=false); + init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlaps=(olx, 2, olz), quiet=true, init_MPI=false); @testset "initialized" begin @test GG.grid_is_initialized() end @@ -99,7 +99,7 @@ nz = 1; @test_throws ErrorException init_global_grid(nx, 1, nz, quiet=true, init_MPI=false); # Error: ny==1, while nz>1. @test_throws ErrorException init_global_grid(nx, ny, 1, dimz=3, quiet=true, init_MPI=false); # Error: dimz>1 while nz==1. @test_throws ErrorException init_global_grid(nx, ny, 1, periodz=1, quiet=true, init_MPI=false); # Error: periodz==1 while nz==1. 
- @test_throws ErrorException init_global_grid(nx, ny, nz, periody=1, overlapy=3, quiet=true, init_MPI=false); # Error: periody==1 while ny<2*overlapy-1 (4<5). + @test_throws ErrorException init_global_grid(nx, ny, nz, periody=1, overlaps=(2,3,2), quiet=true, init_MPI=false); # Error: periody==1 while ny<2*overlaps[2]-1 (4<5). @test_throws ErrorException init_global_grid(nx, ny, nz, quiet=true); # Error: MPI already initialized @testset "already initialized exception" begin init_global_grid(nx, ny, nz, quiet=true, init_MPI=false); From c9a32111d2c7bd7d80e410789bfb18f7141de387 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:12:51 +0200 Subject: [PATCH 05/37] adjust unit tests to new overlaps keyword --- test/test_tools.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_tools.jl b/test/test_tools.jl index 30083f3..fdcf432 100644 --- a/test/test_tools.jl +++ b/test/test_tools.jl @@ -77,7 +77,7 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); Vz = zeros(nx, ny, nz+1); A = zeros(nx, ny, nz+2); Sxz = zeros(nx-2,ny-1,nz-2); - init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false); + init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlaps=(3,2,3), quiet=true, init_MPI=false); @testset "nx_g / ny_g / nz_g" begin @test nx_g() == nx @test ny_g() == ny From b8aa1e7eae1759cf6f18efe9072bff002ee3be87 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:12:59 +0200 Subject: [PATCH 06/37] adjust unit tests to new overlaps keyword --- test/test_update_halo.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index a737bc1..e58120f 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -213,7 +213,7 @@ dz = 1.0 @testset "3. data transfer components" begin @testset "iwrite_sendbufs! / iread_recvbufs!" begin @testset "sendranges / recvranges ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); @test GG.sendranges(1, 1, P) == [ 2:2, 1:size(P,2), 1:size(P,3)] @@ -409,7 +409,7 @@ dz = 1.0 end; end @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]); @@ -497,7 +497,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "iread_recvbufs! 
($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); GG.allocate_bufs(P, A); @@ -611,7 +611,7 @@ dz = 1.0 end; if (nprocs==1) @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); GG.allocate_bufs(P, A); @@ -789,7 +789,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "3D (non-default overlap)" begin - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=4, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz); P .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P_ref = copy(P); @@ -869,7 +869,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "3D (non-default overlap)" begin - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(3,2,3), quiet=true, init_MPI=false, device_type=device_type); Vx = zeros(nx+1,ny,nz); Vx .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)]; Vx_ref = copy(Vx); From 2ac9af489c26853c4b726aff704aad6a01c3ead2 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 27 Sep 2023 19:20:25 +0200 Subject: [PATCH 07/37] add unit tests for halowidth default initialization --- test/test_init_global_grid.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl index fe90f58..84bd88b 100644 --- a/test/test_init_global_grid.jl +++ b/test/test_init_global_grid.jl @@ -100,6 +100,7 @@ nz = 1; @test_throws ErrorException init_global_grid(nx, ny, 1, dimz=3, quiet=true, init_MPI=false); # Error: dimz>1 while nz==1. @test_throws ErrorException init_global_grid(nx, ny, 1, periodz=1, quiet=true, init_MPI=false); # Error: periodz==1 while nz==1. @test_throws ErrorException init_global_grid(nx, ny, nz, periody=1, overlaps=(2,3,2), quiet=true, init_MPI=false); # Error: periody==1 while ny<2*overlaps[2]-1 (4<5). + @test_throws ErrorException init_global_grid(nx, ny, nz, overlaps=(4,3,2), halowidths=(2,2,1), quiet=true, init_MPI=false); # Error: halowidths[2]==2 while overlaps[2]==3. 
@test_throws ErrorException init_global_grid(nx, ny, nz, quiet=true); # Error: MPI already initialized @testset "already initialized exception" begin init_global_grid(nx, ny, nz, quiet=true, init_MPI=false); From 11ac292ecc548ccb4237eb29b8bdca8a10df93cd Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 28 Sep 2023 18:01:27 +0200 Subject: [PATCH 08/37] introduce GGFieldConvertible --- src/shared.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/shared.jl b/src/shared.jl index 23d8d58..c918df1 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -38,10 +38,12 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU" ##------ ## TYPES -const GGInt = Cint -const GGNumber = Number -const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} -const GGField{T,N} = NamedTuple{(:A, :halowidths), Tuple{GGArray{T,N}, Tuple{Int,Int,Int}}} +const GGInt = Cint +const GGNumber = Number +const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGField{T,N} = NamedTuple{(:A, :halowidths), Tuple{GGArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} +const GGFieldConvertible{T,N} = NamedTuple{(:A, :halowidths), <:Tuple{GGArray{T,N}, Tuple{T2,T2,T2}}} where {T2<:Integer} +const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A)}((t.A, GGInt.(t.halowidths))) "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid @@ -94,6 +96,8 @@ me() = global_grid().me comm() = global_grid().comm ol(dim::Integer) = global_grid().overlaps[dim] ol(dim::Integer, A::GGArray) = global_grid().overlaps[dim] + (size(A,dim) - global_grid().nxyz[dim]) +ol(A::GGArray) = (ol(dim, A) for dim in 1:ndims(A)) +hw_default() = global_grid().halowidths neighbors(dim::Integer) = global_grid().neighbors[:,dim] neighbor(n::Integer, dim::Integer) = global_grid().neighbors[n,dim] cuda_enabled() = global_grid().cuda_enabled From af1159078f6140f5556870d0cdcb52ad140d5f11 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 28 Sep 2023 18:04:54 +0200 Subject: [PATCH 09/37] add conversion to GGField and check local halowidth --- src/update_halo.jl | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index ad25e7f..1c8218a 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -22,9 +22,15 @@ Update the halo of the given GPU/CPU-array(s). shell> export IGG_ROCMAWARE_MPI=1 ``` """ -function update_halo!(A::GGArray...) +function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...) check_initialized(); - check_fields(A...); + fields = map(A) do A_i + if isa(A_i, GGField) return A_i + elseif isa(A_i, GGFieldConvertible) return GGField(A_i) + elseif isa(A_i, GGArray) return GGField{eltype(A_i), ndims(A_i)}((A_i, (hw_default()...,))) + end + end + check_fields(fields...); _update_halo!(A...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). return nothing end @@ -801,12 +807,13 @@ end ##------------------------------------------- ## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS -function check_fields(fields::GGArray...) 
- # Raise an error if any of the given fields does not have a halo. +# NOTE: no comparison must be done between the field-local halowidths and field-local overlaps because any combination is valid: the rational is that a field has simply no halo but only computation overlap in a given dimension if the corresponding local overlap is less than 2 times the local halowidth. This allows to determine whether a halo update needs to be done in a certain dimension or not. +function check_fields(fields::GGField...) + # Raise an error if any of the given fields has no halo at all (in any dimension) - in this case there is no halo update to do and including the field in the call is inconsistent. no_halo = Int[]; for i = 1:length(fields) - A = fields[i]; - if all([ol(dim, A) < 2 for dim = 1:ndims(A)]) # There is no halo if the overlap is less than 2... + A, halowidths = fields[i] + if all([halowidths[dim]==0 || (ol(dim, A) < 2*halowidths[dim]) for dim = 1:ndims(A)]) # There is no halo if the overlap is less than 2 times the halowidth (only computation overlap in this case)... push!(no_halo, i); end end @@ -817,7 +824,7 @@ function check_fields(fields::GGArray...) end # Raise an error if any of the given fields contains any duplicates. - duplicates = [[i,j] for i=1:length(fields) for j=i+1:length(fields) if fields[i]===fields[j]]; + duplicates = [[i,j] for i=1:length(fields) for j=i+1:length(fields) if fields[i].A===fields[j].A]; if length(duplicates) > 2 error("The pairs of fields with the positions $(join(duplicates,", "," and ")) are the same; remove any duplicates from the call.") elseif length(duplicates) > 0 @@ -825,7 +832,7 @@ function check_fields(fields::GGArray...) end # Raise an error if not all fields are of the same datatype (restriction comes from buffer handling). 
- different_types = [i for i=2:length(fields) if typeof(fields[i])!=typeof(fields[1])]; + different_types = [i for i=2:length(fields) if typeof(fields[i].A)!=typeof(fields[1].A)]; if length(different_types) > 1 error("The fields at positions $(join(different_types,", "," and ")) are of different type than the first field; make sure that in a same call all fields are of the same type.") elseif length(different_types) == 1 From f58fca5590d5917a75a87c317bc166f06b2361aa Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 28 Sep 2023 18:05:55 +0200 Subject: [PATCH 10/37] add unit test for halo width initialization --- test/test_init_global_grid.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl index 84bd88b..ba975fb 100644 --- a/test/test_init_global_grid.jl +++ b/test/test_init_global_grid.jl @@ -36,6 +36,7 @@ nz = 1; @test GG.global_grid().nxyz == [nx, ny, nz] @test GG.global_grid().dims == dims @test GG.global_grid().overlaps == [2, 2, 2] + @test GG.global_grid().halowidths== [1, 1, 1] @test GG.global_grid().nprocs == nprocs @test GG.global_grid().me == me @test GG.global_grid().coords == coords From 1aada574db3834ac8c4b3ff4088907ea640fc355 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 28 Sep 2023 18:07:13 +0200 Subject: [PATCH 11/37] add exception tests for local halowidths --- test/test_update_halo.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index e58120f..9dc0c3f 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -66,14 +66,16 @@ dz = 1.0 A2 = A; Z = zeros(ComplexF64, nx-1,ny+2,nz+1); Z2 = Z; - @test_throws ErrorException update_halo!(P, Sxz, A) # Error: Sxz has no halo. - @test_throws ErrorException update_halo!(P, Sxz, A, Sxz) # Error: Sxz and Sxz have no halo. - @test_throws ErrorException update_halo!(P, A, A) # Error: A is given twice. - @test_throws ErrorException update_halo!(P, A, A2) # Error: A2 is duplicate of A (an alias; it points to the same memory). - @test_throws ErrorException update_halo!(P, A, A, A2) # Error: the second A and A2 are duplicates of the first A. - @test_throws ErrorException update_halo!(Z, Z2) # Error: Z2 is duplicate of Z (an alias; it points to the same memory). - @test_throws ErrorException update_halo!(Z, P) # Error: P is of different type than Z. - @test_throws ErrorException update_halo!(Z, P, A) # Error: P and A are of different type than Z. + @test_throws ErrorException update_halo!(P, Sxz, A) # Error: Sxz has no halo. + @test_throws ErrorException update_halo!(P, Sxz, A, Sxz) # Error: Sxz and Sxz have no halo. + @test_throws ErrorException update_halo!(A, (A=P, halowidths=(2,2,2))) # Error: P has no halo. + @test_throws ErrorException update_halo!((A=A, halowidths=(0,3,2)), (A=P, halowidths=(2,2,2))) # Error: A and P have no halo. + @test_throws ErrorException update_halo!(P, A, A) # Error: A is given twice. + @test_throws ErrorException update_halo!(P, A, A2) # Error: A2 is duplicate of A (an alias; it points to the same memory). + @test_throws ErrorException update_halo!(P, A, A, A2) # Error: the second A and A2 are duplicates of the first A. + @test_throws ErrorException update_halo!(Z, Z2) # Error: Z2 is duplicate of Z (an alias; it points to the same memory). + @test_throws ErrorException update_halo!(Z, P) # Error: P is of different type than Z. + @test_throws ErrorException update_halo!(Z, P, A) # Error: P and A are of different type than Z. 
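For contrast with the error cases above, a sketch of a valid per-array halo-width override via the named-tuple form (illustrative only; sizes, overlaps and widths are arbitrary):

```julia
using ImplicitGlobalGrid

nx, ny, nz = 32, 32, 32
init_global_grid(nx, ny, nz; overlaps=(4, 2, 2), quiet=true)  # default halowidths = (2, 1, 1)
T = zeros(nx, ny, nz)
V = zeros(nx, ny, nz)
update_halo!(T)                               # plain array: uses the default halo widths
update_halo!(T, (A=V, halowidths=(1, 1, 1)))  # named tuple: overrides the widths for V only
finalize_global_grid()
```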
finalize_global_grid(finalize_MPI=false); end; From 0a2472bef62816d93717bc8ebb585f15340a0e59 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Mon, 2 Oct 2023 15:41:23 +0200 Subject: [PATCH 12/37] adapt array checks for fields --- src/shared.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/shared.jl b/src/shared.jl index c918df1..cf92b4c 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -109,9 +109,9 @@ amdgpuaware_MPI(dim::Integer) = global_grid().amdgpuaware_MPI[dim] loopvectorization() = global_grid().loopvectorization loopvectorization(dim::Integer) = global_grid().loopvectorization[dim] has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL -any_array(fields::GGArray...) = any([is_array(A) for A in fields]) -any_cuarray(fields::GGArray...) = any([is_cuarray(A) for A in fields]) -any_rocarray(fields::GGArray...) = any([is_rocarray(A) for A in fields]) +any_array(fields::GGField...) = any([is_array(A.A) for A in fields]) +any_cuarray(fields::GGField...) = any([is_cuarray(A.A) for A in fields]) +any_rocarray(fields::GGField...) = any([is_rocarray(A.A) for A in fields]) is_array(A::GGArray) = typeof(A) <: Array is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. From bfa8289c4d3d466762b66dfe569be8d519829ad9 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Mon, 2 Oct 2023 15:43:58 +0200 Subject: [PATCH 13/37] introduce arrays for step by step replacement by fields --- src/update_halo.jl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index 1c8218a..b9ee091 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -31,51 +31,51 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...) end end check_fields(fields...); - _update_halo!(A...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). + _update_halo!(A, fields...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). return nothing end -function _update_halo!(fields::GGArray...) +function _update_halo!(arrays, fields::GGField...) if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`. if (any_rocarray(fields...) 
&& !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`. - allocate_bufs(fields...); - if any_array(fields...) allocate_tasks(fields...); end - if any_cuarray(fields...) allocate_custreams(fields...); end - if any_rocarray(fields...) allocate_rocstreams(fields...); end + allocate_bufs(arrays...); + if any_array(fields...) allocate_tasks(arrays...); end + if any_cuarray(fields...) allocate_custreams(arrays...); end + if any_rocarray(fields...) allocate_rocstreams(arrays...); end for dim = 1:NDIMS_MPI # NOTE: this works for 1D-3D (e.g. if nx>1, ny>1 and nz=1, then for d=3, there will be no neighbors, i.e. nothing will be done as desired...). for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) - if has_neighbor(ns, dim) iwrite_sendbufs!(ns, dim, fields[i], i); end + if has_neighbor(ns, dim) iwrite_sendbufs!(ns, dim, arrays[i], i); end end # Send / receive if the neighbors are other processes (usual case). reqs = fill(MPI.REQUEST_NULL, length(fields), NNEIGHBORS_PER_DIM, 2); if all(neighbors(dim) .!= me()) # Note: handling of send/recv to itself requires special configurations for some MPI implementations (e.g. self BTL must be activated with OpenMPI); so we handle this case without MPI to avoid this complication. for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. - if has_neighbor(nr, dim) reqs[i,nr,1] = irecv_halo!(nr, dim, fields[i], i); end + if has_neighbor(nr, dim) reqs[i,nr,1] = irecv_halo!(nr, dim, arrays[i], i); end end for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) if has_neighbor(ns, dim) - wait_iwrite(ns, fields[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. - reqs[i,ns,2] = isend_halo(ns, dim, fields[i], i); + wait_iwrite(ns, arrays[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. + reqs[i,ns,2] = isend_halo(ns, dim, arrays[i], i); end end # Copy if I am my own neighbors (when periodic boundary and only one process in this dimension). elseif all(neighbors(dim) .== me()) for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) - wait_iwrite(ns, fields[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. - sendrecv_halo_local(ns, dim, fields[i], i); + wait_iwrite(ns, arrays[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. + sendrecv_halo_local(ns, dim, arrays[i], i); nr = NNEIGHBORS_PER_DIM - ns + 1; - iread_recvbufs!(nr, dim, fields[i], i); + iread_recvbufs!(nr, dim, arrays[i], i); end else error("Incoherent neighbors in dimension $dim: either all neighbors must equal to me, or none.") end for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. 
if (reqs[i,nr,1]!=MPI.REQUEST_NULL) MPI.Wait!(reqs[i,nr,1]); end - if (has_neighbor(nr, dim) && neighbor(nr, dim)!=me()) iread_recvbufs!(nr, dim, fields[i], i); end # Note: if neighbor(nr,dim) != me() is done directly in the sendrecv_halo_local loop above for better performance (thanks to pipelining) + if (has_neighbor(nr, dim) && neighbor(nr, dim)!=me()) iread_recvbufs!(nr, dim, arrays[i], i); end # Note: if neighbor(nr,dim) != me() is done directly in the sendrecv_halo_local loop above for better performance (thanks to pipelining) end for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. - if has_neighbor(nr, dim) wait_iread(nr, fields[i], i); end + if has_neighbor(nr, dim) wait_iread(nr, arrays[i], i); end end for ns = 1:NNEIGHBORS_PER_DIM if (any(reqs[:,ns,2].!=[MPI.REQUEST_NULL])) MPI.Waitall!(reqs[:,ns,2]); end From 3e15bb75e710297b5542880b2f3253ffc699632f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Mon, 2 Oct 2023 18:45:25 +0200 Subject: [PATCH 14/37] generalization for fields of allocate_bufs --- src/update_halo.jl | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index b9ee091..ab12f56 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -24,12 +24,7 @@ Update the halo of the given GPU/CPU-array(s). """ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...) check_initialized(); - fields = map(A) do A_i - if isa(A_i, GGField) return A_i - elseif isa(A_i, GGFieldConvertible) return GGField(A_i) - elseif isa(A_i, GGArray) return GGField{eltype(A_i), ndims(A_i)}((A_i, (hw_default()...,))) - end - end + fields = wrap_field.(A); check_fields(fields...); _update_halo!(A, fields...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). return nothing @@ -38,7 +33,7 @@ end function _update_halo!(arrays, fields::GGField...) if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`. if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`. - allocate_bufs(arrays...); + allocate_bufs(fields...); if any_array(fields...) allocate_tasks(arrays...); end if any_cuarray(fields...) allocate_custreams(arrays...); end if any_rocarray(fields...) allocate_rocstreams(arrays...); end @@ -153,7 +148,7 @@ let end # Allocate for each field two send and recv buffers (one for the left and one for the right neighbour of a dimension). The required length of the buffer is given by the maximal number of halo elements in any of the dimensions. Note that buffers are not allocated separately for each dimension, as the updates are performed one dimension at a time (required for correctness). 
- function allocate_bufs(fields::GGArray{T}...) where T <: GGNumber + function allocate_bufs(fields::GGField{T}...) where T <: GGNumber if (isnothing(sendbufs_raw) || isnothing(recvbufs_raw)) free_update_halo_buffers(); init_bufs_arrays(); @@ -164,13 +159,13 @@ let if cuda_enabled() init_cubufs(T, fields...); end if amdgpu_enabled() init_rocbufs(T, fields...); end for i = 1:length(fields) - A = fields[i]; + A, halowidths = fields[i]; for n = 1:NNEIGHBORS_PER_DIM # Ensure that the buffers are interpreted to contain elements of the same type as the array. reinterpret_bufs(T, i, n); if cuda_enabled() reinterpret_cubufs(T, i, n); end if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end end - max_halo_elems = (ndims(A) > 1) ? prod(sort([size(A)...])[2:end]) : 1; + max_halo_elems = (ndims(A) > 1) ? maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])) : halowidths[1]; if (length(sendbufs_raw[i][1]) < max_halo_elems) for n = 1:NNEIGHBORS_PER_DIM reallocate_bufs(T, i, n, max_halo_elems); @@ -200,7 +195,7 @@ let recvbufs_raw = Array{Array{Any,1},1}(); end - function init_bufs(T::DataType, fields::GGArray...) + function init_bufs(T::DataType, fields::GGField...) while (length(sendbufs_raw) < length(fields)) push!(sendbufs_raw, [zeros(T,0), zeros(T,0)]); end while (length(recvbufs_raw) < length(fields)) push!(recvbufs_raw, [zeros(T,0), zeros(T,0)]); end end @@ -225,7 +220,7 @@ let curecvbufs_raw_h = Array{Array{Any,1},1}(); end - function init_cubufs(T::DataType, fields::GGArray...) + function init_cubufs(T::DataType, fields::GGField...) while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end @@ -258,7 +253,7 @@ let # INFO: no need for roc host buffers end - function init_rocbufs(T::DataType, fields::GGArray...) + function init_rocbufs(T::DataType, fields::GGField...) while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end # INFO: no need for roc host buffers @@ -804,8 +799,17 @@ function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber end -##------------------------------------------- -## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS +##-------------------------------------------------------- +## FUNCTIONS FOR WRAPPING AND CHECKING THE INPUT ARGUMENTS + + +# Wrap the input argument into a GGField. +wrap_field(A::GGField) = A +wrap_field(A::GGFieldConvertible) = GGField(A) +wrap_field(A::GGArray, hw::Tuple) = GGField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) +wrap_field(A::GGArray) = wrap_field(A, hw_default()...) + # NOTE: no comparison must be done between the field-local halowidths and field-local overlaps because any combination is valid: the rational is that a field has simply no halo but only computation overlap in a given dimension if the corresponding local overlap is less than 2 times the local halowidth. This allows to determine whether a halo update needs to be done in a certain dimension or not. function check_fields(fields::GGField...) 
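A sketch of how the accepted argument forms are normalized by the `wrap_field` helpers defined above (illustrative; the `GG` alias and the prior grid initialization are assumptions mirroring the test files):

```julia
using ImplicitGlobalGrid
const GG = ImplicitGlobalGrid                   # module alias, as used in the tests

init_global_grid(8, 8, 8; quiet=true)           # needed so that default halowidths exist
A = zeros(8, 8, 8)
# All three forms yield the same internal GGField named tuple (A, halowidths):
f1 = GG.wrap_field(A)                           # default halowidths from the global grid
f2 = GG.wrap_field(A, 1, 1, 1)                  # halowidths given as separate integers
f3 = GG.wrap_field((A=A, halowidths=(1, 1, 1))) # user-facing named-tuple form
@assert f2.A === A && f2.halowidths == (1, 1, 1)
finalize_global_grid()
```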
From 55850b8a58eaa35f92000941eba693a0402f5845 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Mon, 2 Oct 2023 18:46:50 +0200 Subject: [PATCH 15/37] generalization for fields of allocate_bufs --- test/test_update_halo.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index 9dc0c3f..22118d5 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -90,7 +90,7 @@ dz = 1.0 @testset "free buffers" begin @require GG.get_sendbufs_raw() === nothing @require GG.get_recvbufs_raw() === nothing - GG.allocate_bufs(P); + GG.allocate_bufs(GG.wrap_field(P)); @require GG.get_sendbufs_raw() !== nothing @require GG.get_recvbufs_raw() !== nothing GG.free_update_halo_buffers(); @@ -99,7 +99,7 @@ dz = 1.0 end; @testset "allocate single" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(P); + GG.allocate_bufs(GG.wrap_field(P)); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 1 # 1 array @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -110,7 +110,7 @@ dz = 1.0 end; @testset "allocate single (Complex)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(Z); + GG.allocate_bufs(GG.wrap_field(Z)); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 1 # 1 array @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -121,8 +121,8 @@ dz = 1.0 end; @testset "keep 1st, allocate 2nd" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(P); - GG.allocate_bufs(A, P); + GG.allocate_bufs(GG.wrap_field(P)); + GG.allocate_bufs(GG.wrap_field.((A, P))...); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # 2 arrays @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -135,8 +135,8 @@ dz = 1.0 end; @testset "keep 1st, allocate 2nd (Complex)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(Z); - GG.allocate_bufs(Y, Z); + GG.allocate_bufs(GG.wrap_field(Z)); + GG.allocate_bufs(GG.wrap_field.((Y, Z))...); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # 2 arrays @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -149,8 +149,8 @@ dz = 1.0 end; @testset "reinterpret (no allocation)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(A, P); - GG.allocate_bufs(B, C); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory + GG.allocate_bufs(GG.wrap_field.((A, P))...); + GG.allocate_bufs(GG.wrap_field.((B, C))...); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # Still 2 arrays: B, C (even though they are different then before: was A and P) @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -164,8 +164,8 @@ dz = 1.0 end; @testset "reinterpret (no allocation) (Complex)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(A, P); - GG.allocate_bufs(Y, Z); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory + GG.allocate_bufs(GG.wrap_field.((A, P))...); + GG.allocate_bufs(GG.wrap_field.((Y, Z))...); # The new 
arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # Still 2 arrays: B, C (even though they are different then before: was A and P) @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -183,7 +183,7 @@ dz = 1.0 sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); end GG.free_update_halo_buffers(); - GG.allocate_bufs(A, P); + GG.allocate_bufs(GG.wrap_field.((A, P))...); for dim = 1:ndims(A), n = 1:nneighbors_per_dim @test all(size(sendbuf(n,dim,1,A)) .== size(A)[1:ndims(A).!=dim]) @test all(size(recvbuf(n,dim,1,A)) .== size(A)[1:ndims(A).!=dim]) @@ -199,7 +199,7 @@ dz = 1.0 sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); end GG.free_update_halo_buffers(); - GG.allocate_bufs(Y, Z); + GG.allocate_bufs(GG.wrap_field.((Y, Z))...); for dim = 1:ndims(Y), n = 1:nneighbors_per_dim @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim]) @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim]) @@ -416,7 +416,7 @@ dz = 1.0 A = zeros(nx-1,ny+2,nz+1); P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]); A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]); - GG.allocate_bufs(P, A); + GG.allocate_bufs(GG.wrap_field.((P, A))...); if (array_type == "CUDA") GG.allocate_custreams(P, A); elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A); else GG.allocate_tasks(P, A); @@ -502,7 +502,7 @@ dz = 1.0 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); - GG.allocate_bufs(P, A); + GG.allocate_bufs(GG.wrap_field.((P, A))...); if (array_type == "CUDA") GG.allocate_custreams(P, A); elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A); else GG.allocate_tasks(P, A); @@ -616,7 +616,7 @@ dz = 1.0 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); - GG.allocate_bufs(P, A); + GG.allocate_bufs(GG.wrap_field.((P, A))...); dim = 1 for n = 1:nneighbors_per_dim if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) @@ -702,7 +702,7 @@ dz = 1.0 P = zeros(nx,ny,nz); A = zeros(nx,ny,nz); dim = 1; - GG.allocate_bufs(P, A); + GG.allocate_bufs(GG.wrap_field.((P, A))...); for n = 1:nneighbors_per_dim if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) GG.gpusendbuf(n,dim,1,P) .= 9.0; From 644888cc4997403e58e95085708fea306256898a Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Mon, 2 Oct 2023 19:02:42 +0200 Subject: [PATCH 16/37] generalization for fields of allocate tasks and streams --- src/update_halo.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index ab12f56..47610bf 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -34,9 +34,9 @@ function _update_halo!(arrays, fields::GGField...) if (any_cuarray(fields...) 
&& !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`. if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`. allocate_bufs(fields...); - if any_array(fields...) allocate_tasks(arrays...); end - if any_cuarray(fields...) allocate_custreams(arrays...); end - if any_rocarray(fields...) allocate_rocstreams(arrays...); end + if any_array(fields...) allocate_tasks(fields...); end + if any_cuarray(fields...) allocate_custreams(fields...); end + if any_rocarray(fields...) allocate_rocstreams(fields...); end for dim = 1:NDIMS_MPI # NOTE: this works for 1D-3D (e.g. if nx>1, ny>1 and nz=1, then for d=3, there will be no neighbors, i.e. nothing will be done as desired...). for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) @@ -348,7 +348,7 @@ end # (CPU functions) -function allocate_tasks(fields::GGArray...) +function allocate_tasks(fields::GGField...) allocate_tasks_iwrite(fields...); allocate_tasks_iread(fields...); end @@ -360,7 +360,7 @@ let wait_iwrite(n::Integer, A::Array{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) # The argument A is used for multiple dispatch. #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance as the tasks are mapped only to one thread via context switching). - function allocate_tasks_iwrite(fields::GGArray...) + function allocate_tasks_iwrite(fields::GGField...) if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an Array tasks = [tasks Array{Task}(undef, NNEIGHBORS_PER_DIM, length(fields)-size(tasks,2))]; # Create (additional) emtpy tasks. end @@ -386,7 +386,7 @@ let wait_iread(n::Integer, A::Array{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance currently as the tasks are mapped only to one thread via context switching). - function allocate_tasks_iread(fields::GGArray...) + function allocate_tasks_iread(fields::GGField...) if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an Array tasks = [tasks Array{Task}(undef, NNEIGHBORS_PER_DIM, length(fields)-size(tasks,2))]; # Create (additional) emtpy tasks. end @@ -408,7 +408,7 @@ end # (CUDA functions) -function allocate_custreams(fields::GGArray...) +function allocate_custreams(fields::GGField...) allocate_custreams_iwrite(fields...); allocate_custreams_iread(fields...); end @@ -420,7 +420,7 @@ let wait_iwrite(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - function allocate_custreams_iwrite(fields::GGArray...) + function allocate_custreams_iwrite(fields::GGField...) 
if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuArray custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end @@ -448,7 +448,7 @@ let wait_iread(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - function allocate_custreams_iread(fields::GGArray...) + function allocate_custreams_iread(fields::GGField...) if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuArray custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end @@ -472,7 +472,7 @@ end # (AMDGPU functions) -function allocate_rocstreams(fields::GGArray...) +function allocate_rocstreams(fields::GGField...) allocate_rocstreams_iwrite(fields...); allocate_rocstreams_iread(fields...); end @@ -484,7 +484,7 @@ let wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - function allocate_rocstreams_iwrite(fields::GGArray...) + function allocate_rocstreams_iwrite(fields::GGField...) if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCArray rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end @@ -514,7 +514,7 @@ let wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - function allocate_rocstreams_iread(fields::GGArray...) + function allocate_rocstreams_iread(fields::GGField...) if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCArray rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end From dac3fdbb6efc386c69133136fed9cb65bab1cfbd Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 3 Oct 2023 12:12:09 +0200 Subject: [PATCH 17/37] generalization for fields of send and receive functions --- src/update_halo.jl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index 47610bf..a64d699 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -46,19 +46,19 @@ function _update_halo!(arrays, fields::GGField...) reqs = fill(MPI.REQUEST_NULL, length(fields), NNEIGHBORS_PER_DIM, 2); if all(neighbors(dim) .!= me()) # Note: handling of send/recv to itself requires special configurations for some MPI implementations (e.g. self BTL must be activated with OpenMPI); so we handle this case without MPI to avoid this complication. for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. 
- if has_neighbor(nr, dim) reqs[i,nr,1] = irecv_halo!(nr, dim, arrays[i], i); end + if has_neighbor(nr, dim) reqs[i,nr,1] = irecv_halo!(nr, dim, fields[i], i); end end for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) if has_neighbor(ns, dim) wait_iwrite(ns, arrays[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. - reqs[i,ns,2] = isend_halo(ns, dim, arrays[i], i); + reqs[i,ns,2] = isend_halo(ns, dim, fields[i], i); end end # Copy if I am my own neighbors (when periodic boundary and only one process in this dimension). elseif all(neighbors(dim) .== me()) for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) wait_iwrite(ns, arrays[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. - sendrecv_halo_local(ns, dim, arrays[i], i); + sendrecv_halo_local(ns, dim, fields[i], i); nr = NNEIGHBORS_PER_DIM - ns + 1; iread_recvbufs!(nr, dim, arrays[i], i); end @@ -711,9 +711,10 @@ end ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS -function irecv_halo!(n::Integer, dim::Integer, A::GGArray, i::Integer; tag::Integer=0) +function irecv_halo!(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integer=0) req = MPI.REQUEST_NULL; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + A, halowidths = F.A, F.halowidths; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); else @@ -723,9 +724,10 @@ function irecv_halo!(n::Integer, dim::Integer, A::GGArray, i::Integer; tag::Inte return req end -function isend_halo(n::Integer, dim::Integer, A::GGArray, i::Integer; tag::Integer=0) +function isend_halo(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integer=0) req = MPI.REQUEST_NULL; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + A, halowidths = F.A, F.halowidths; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) req = MPI.Isend(gpusendbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); else @@ -735,8 +737,9 @@ function isend_halo(n::Integer, dim::Integer, A::GGArray, i::Integer; tag::Integ return req end -function sendrecv_halo_local(n::Integer, dim::Integer, A::GGArray, i::Integer) - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... +function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer) + A, halowidths = F.A, F.halowidths; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... 
if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) if n == 1 gpumemcopy!(gpurecvbuf_flat(2,dim,i,A), gpusendbuf_flat(1,dim,i,A)); From 923bbb23605c0571ecfa36e8807074a85e258d26 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 3 Oct 2023 12:19:46 +0200 Subject: [PATCH 18/37] use field.A for multiple dispatch of waiting functions --- src/update_halo.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index a64d699..19566cd 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -50,14 +50,14 @@ function _update_halo!(arrays, fields::GGField...) end for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) if has_neighbor(ns, dim) - wait_iwrite(ns, arrays[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. + wait_iwrite(ns, fields[i].A, i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. reqs[i,ns,2] = isend_halo(ns, dim, fields[i], i); end end # Copy if I am my own neighbors (when periodic boundary and only one process in this dimension). elseif all(neighbors(dim) .== me()) for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) - wait_iwrite(ns, arrays[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. + wait_iwrite(ns, fields[i].A, i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. sendrecv_halo_local(ns, dim, fields[i], i); nr = NNEIGHBORS_PER_DIM - ns + 1; iread_recvbufs!(nr, dim, arrays[i], i); @@ -70,7 +70,7 @@ function _update_halo!(arrays, fields::GGField...) if (has_neighbor(nr, dim) && neighbor(nr, dim)!=me()) iread_recvbufs!(nr, dim, arrays[i], i); end # Note: if neighbor(nr,dim) != me() is done directly in the sendrecv_halo_local loop above for better performance (thanks to pipelining) end for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. 
- if has_neighbor(nr, dim) wait_iread(nr, arrays[i], i); end + if has_neighbor(nr, dim) wait_iread(nr, fields[i].A, i); end end for ns = 1:NNEIGHBORS_PER_DIM if (any(reqs[:,ns,2].!=[MPI.REQUEST_NULL])) MPI.Waitall!(reqs[:,ns,2]); end From 94c84dd5d0e9892d9cdb9804f386774eaf52d6e9 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 3 Oct 2023 18:53:11 +0200 Subject: [PATCH 19/37] Introduce architecture-specific field types --- src/shared.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/shared.jl b/src/shared.jl index cf92b4c..307edff 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -44,6 +44,9 @@ const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} const GGField{T,N} = NamedTuple{(:A, :halowidths), Tuple{GGArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} const GGFieldConvertible{T,N} = NamedTuple{(:A, :halowidths), <:Tuple{GGArray{T,N}, Tuple{T2,T2,T2}}} where {T2<:Integer} const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A)}((t.A, GGInt.(t.halowidths))) +const CPUField{T,N} = NamedTuple{(:A, :halowidths), Tuple{Array{T,N}, Tuple{GGInt,GGInt,GGInt}}} +const CuField{T,N} = NamedTuple{(:A, :halowidths), Tuple{CuArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} +const ROCField{T,N} = NamedTuple{(:A, :halowidths), Tuple{ROCArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid From 7b08c9c2e8241f81340d9d9ffa7625b781688270 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Tue, 3 Oct 2023 19:01:20 +0200 Subject: [PATCH 20/37] generalization for fields of write and wait functions --- src/update_halo.jl | 56 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index 19566cd..7ffc1a5 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -26,11 +26,11 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...) check_initialized(); fields = wrap_field.(A); check_fields(fields...); - _update_halo!(A, fields...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). + _update_halo!(fields...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). return nothing end -function _update_halo!(arrays, fields::GGField...) +function _update_halo!(fields::GGField...) if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`. if (any_rocarray(fields...) 
&& !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`. allocate_bufs(fields...); @@ -40,7 +40,7 @@ function _update_halo!(arrays, fields::GGField...) for dim = 1:NDIMS_MPI # NOTE: this works for 1D-3D (e.g. if nx>1, ny>1 and nz=1, then for d=3, there will be no neighbors, i.e. nothing will be done as desired...). for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) - if has_neighbor(ns, dim) iwrite_sendbufs!(ns, dim, arrays[i], i); end + if has_neighbor(ns, dim) iwrite_sendbufs!(ns, dim, fields[i], i); end end # Send / receive if the neighbors are other processes (usual case). reqs = fill(MPI.REQUEST_NULL, length(fields), NNEIGHBORS_PER_DIM, 2); @@ -50,27 +50,27 @@ function _update_halo!(arrays, fields::GGField...) end for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) if has_neighbor(ns, dim) - wait_iwrite(ns, fields[i].A, i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. + wait_iwrite(ns, fields[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. reqs[i,ns,2] = isend_halo(ns, dim, fields[i], i); end end # Copy if I am my own neighbors (when periodic boundary and only one process in this dimension). elseif all(neighbors(dim) .== me()) for ns = 1:NNEIGHBORS_PER_DIM, i = 1:length(fields) - wait_iwrite(ns, fields[i].A, i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. + wait_iwrite(ns, fields[i], i); # Right before starting to send, make sure that the data of neighbor ns and field i has finished writing to the sendbuffer. sendrecv_halo_local(ns, dim, fields[i], i); nr = NNEIGHBORS_PER_DIM - ns + 1; - iread_recvbufs!(nr, dim, arrays[i], i); + iread_recvbufs!(nr, dim, fields[i], i); end else error("Incoherent neighbors in dimension $dim: either all neighbors must equal to me, or none.") end for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. if (reqs[i,nr,1]!=MPI.REQUEST_NULL) MPI.Wait!(reqs[i,nr,1]); end - if (has_neighbor(nr, dim) && neighbor(nr, dim)!=me()) iread_recvbufs!(nr, dim, arrays[i], i); end # Note: if neighbor(nr,dim) != me() is done directly in the sendrecv_halo_local loop above for better performance (thanks to pipelining) + if (has_neighbor(nr, dim) && neighbor(nr, dim)!=me()) iread_recvbufs!(nr, dim, fields[i], i); end # Note: if neighbor(nr,dim) != me() is done directly in the sendrecv_halo_local loop above for better performance (thanks to pipelining) end for nr = NNEIGHBORS_PER_DIM:-1:1, i = 1:length(fields) # Note: if there were indeed more than 2 neighbors per dimension; then one would need to make sure which neigbour would communicate with which. 
- if has_neighbor(nr, dim) wait_iread(nr, fields[i].A, i); end + if has_neighbor(nr, dim) wait_iread(nr, fields[i], i); end end for ns = 1:NNEIGHBORS_PER_DIM if (any(reqs[:,ns,2].!=[MPI.REQUEST_NULL])) MPI.Waitall!(reqs[:,ns,2]); end @@ -358,15 +358,16 @@ let tasks = Array{Task}(undef, NNEIGHBORS_PER_DIM, 0); - wait_iwrite(n::Integer, A::Array{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) # The argument A is used for multiple dispatch. #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance as the tasks are mapped only to one thread via context switching). + wait_iwrite(n::Integer, A::CPUField{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) # The argument A is used for multiple dispatch. #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance as the tasks are mapped only to one thread via context switching). function allocate_tasks_iwrite(fields::GGField...) - if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an Array + if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an CPUField tasks = [tasks Array{Task}(undef, NNEIGHBORS_PER_DIM, length(fields)-size(tasks,2))]; # Create (additional) emtpy tasks. end end - function iwrite_sendbufs!(n::Integer, dim::Integer, A::Array{T}, i::Integer) where T <: GGNumber # Function to be called if A is a CPU array. + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CPUField{T}, i::Integer) where T <: GGNumber # Function to be called if A is a CPUField. + A, halowidths = F; tasks[n,i] = @task begin if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... write_h2h!(sendbuf(n,dim,i,A), A, sendranges(n,dim,A), dim); @@ -384,7 +385,7 @@ let tasks = Array{Task}(undef, NNEIGHBORS_PER_DIM, 0); - wait_iread(n::Integer, A::Array{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance currently as the tasks are mapped only to one thread via context switching). + wait_iread(n::Integer, A::CPUField{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance currently as the tasks are mapped only to one thread via context switching). function allocate_tasks_iread(fields::GGField...) if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an Array @@ -392,7 +393,8 @@ let end end - function iread_recvbufs!(n::Integer, dim::Integer, A::Array{T}, i::Integer) where T <: GGNumber + function iread_recvbufs!(n::Integer, dim::Integer, F::CPUField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; tasks[n,i] = @task begin if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... 
read_h2h!(recvbuf(n,dim,i,A), A, recvranges(n,dim,A), dim); @@ -418,15 +420,16 @@ let custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iwrite(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); function allocate_custreams_iwrite(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuArray + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iwrite_sendbufs!(n::Integer, dim::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). ranges = sendranges(n, dim, A); @@ -446,15 +449,16 @@ let custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iread(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); function allocate_custreams_iread(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuArray + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iread_recvbufs!(n::Integer, dim::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber + function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). ranges = recvranges(n, dim, A); @@ -482,15 +486,16 @@ let rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); function allocate_rocstreams_iwrite(fields::GGField...) 
- if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCArray + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iwrite_sendbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber + function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... # DEBUG: the follow section needs perf testing # DEBUG 2: commenting read_h2d_async! for now @@ -512,15 +517,16 @@ let rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); function allocate_rocstreams_iread(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCArray + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iread_recvbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber + function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... # DEBUG: the follow section needs perf testing # DEBUG 2: commenting read_h2d_async! 
for now From 78f2d1a50e65b95548c20288b7565f891fbb0163 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 5 Oct 2023 15:04:46 +0200 Subject: [PATCH 21/37] enhance field types and make halosize allways 3D --- src/shared.jl | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/shared.jl b/src/shared.jl index 307edff..f2d2b3d 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -38,15 +38,22 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU" ##------ ## TYPES -const GGInt = Cint -const GGNumber = Number -const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} -const GGField{T,N} = NamedTuple{(:A, :halowidths), Tuple{GGArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} -const GGFieldConvertible{T,N} = NamedTuple{(:A, :halowidths), <:Tuple{GGArray{T,N}, Tuple{T2,T2,T2}}} where {T2<:Integer} -const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A)}((t.A, GGInt.(t.halowidths))) -const CPUField{T,N} = NamedTuple{(:A, :halowidths), Tuple{Array{T,N}, Tuple{GGInt,GGInt,GGInt}}} -const CuField{T,N} = NamedTuple{(:A, :halowidths), Tuple{CuArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} -const ROCField{T,N} = NamedTuple{(:A, :halowidths), Tuple{ROCArray{T,N}, Tuple{GGInt,GGInt,GGInt}}} +const GGInt = Cint +const GGNumber = Number +const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGField{T,N,T_array} = NamedTuple{(:A, :halowidths), Tuple{T_array, Tuple{GGInt,GGInt,GGInt}}} where {T_array<:GGArray{T,N}} +const GGFieldConvertible{T,N,T_array} = NamedTuple{(:A, :halowidths), <:Tuple{T_array, Tuple{T2,T2,T2}}} where {T_array<:GGArray{T,N}, T2<:Integer} +const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A),typeof(t.A)}((t.A, GGInt.(t.halowidths))) +const CPUField{T,N} = GGField{T,N,Array{T,N}} +const CuField{T,N} = GGField{T,N,CuArray{T,N}} +const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} + +Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A) +Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...) +Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A) +Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A) +Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A) + "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid From 116bce309ddfef5c09aa1d10a2ce92136d57b2f1 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 5 Oct 2023 15:16:06 +0200 Subject: [PATCH 22/37] generalization for fields in buffer creation, writing and send and receive --- src/update_halo.jl | 130 +++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index 7ffc1a5..e8936d1 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -82,7 +82,7 @@ end ##--------------------------- ## FUNCTIONS FOR SYNTAX SUGAR -halosize(dim::Integer, A::GGArray) = (ndims(A)>1) ? size(A)[1:ndims(A).!=dim] : (1,); +halosize(dim::Integer, A::GGField) = (dim==1) ? (A.halowidths[1], size(A,2), size(A,3)) : ((dim==2) ? 
(size(A,1), A.halowidths[2], size(A,3)) : (size(A,1), size(A,2), A.halowidths[3])) ##--------------------------------------- @@ -278,41 +278,41 @@ let # (CPU functions) - function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGArray{T}) where T <: GGNumber + function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber return view(sendbufs_raw[i][n]::AbstractVector{T},1:prod(halosize(dim,A))); end - function recvbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGArray{T}) where T <: GGNumber + function recvbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber return view(recvbufs_raw[i][n]::AbstractVector{T},1:prod(halosize(dim,A))); end - function sendbuf(n::Integer, dim::Integer, i::Integer, A::GGArray) + function sendbuf(n::Integer, dim::Integer, i::Integer, A::GGField) return reshape(sendbuf_flat(n,dim,i,A), halosize(dim,A)); end - function recvbuf(n::Integer, dim::Integer, i::Integer, A::GGArray) + function recvbuf(n::Integer, dim::Integer, i::Integer, A::GGField) return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A)); end # (CUDA functions) - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuArray{T}) where T <: GGNumber + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); end - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuArray{T}) where T <: GGNumber + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); end # (AMDGPU functions) - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCArray{T}) where T <: GGNumber + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); end - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCArray{T}) where T <: GGNumber + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); end @@ -320,11 +320,11 @@ let # (GPU functions) #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. - function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuArray{T}, ROCArray{T}}) where T <: GGNumber + function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); end - function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuArray{T}, ROCArray{T}}) where T <: GGNumber + function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); end @@ -363,14 +363,14 @@ let function allocate_tasks_iwrite(fields::GGField...) 
if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an CPUField tasks = [tasks Array{Task}(undef, NNEIGHBORS_PER_DIM, length(fields)-size(tasks,2))]; # Create (additional) emtpy tasks. - end + end end function iwrite_sendbufs!(n::Integer, dim::Integer, F::CPUField{T}, i::Integer) where T <: GGNumber # Function to be called if A is a CPUField. A, halowidths = F; tasks[n,i] = @task begin - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... - write_h2h!(sendbuf(n,dim,i,A), A, sendranges(n,dim,A), dim); + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + write_h2h!(sendbuf(n,dim,i,F), A, sendranges(n,dim,F), dim); end end end @@ -396,8 +396,8 @@ let function iread_recvbufs!(n::Integer, dim::Integer, F::CPUField{T}, i::Integer) where T <: GGNumber A, halowidths = F; tasks[n,i] = @task begin - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... - read_h2h!(recvbuf(n,dim,i,A), A, recvranges(n,dim,A), dim); + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + read_h2h!(recvbuf(n,dim,i,F), A, recvranges(n,dim,F), dim); end end end @@ -430,15 +430,15 @@ let function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber A, halowidths = F; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, A); + ranges = sendranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); else - write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), custreams[n,i]); + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); end end end @@ -459,15 +459,15 @@ let function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber A, halowidths = F; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, A); + ranges = recvranges(n, dim, F); nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); else - read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), custreams[n,i]); + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); end end end @@ -496,17 +496,17 @@ let function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber A, halowidths = F; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... # DEBUG: the follow section needs perf testing # DEBUG 2: commenting read_h2d_async! for now # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, A); + ranges = sendranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); # else - # write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]); + # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); # end end end @@ -527,17 +527,17 @@ let function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber A, halowidths = F; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... # DEBUG: the follow section needs perf testing # DEBUG 2: commenting read_h2d_async! for now # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, A); + ranges = recvranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); # else - # read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]); + # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); # end end end @@ -548,24 +548,26 @@ end # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). 
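# A standalone numeric check of the range arithmetic in `sendranges`/`recvranges` below,
# with assumed values for one dimension: local size 8, overlap 4, halo width 2:
n_dim, ol, hw = 8, 4, 2
send_lo = (1 + (ol - hw)):(1 + (ol - hw) + hw - 1)        # planes sent towards the lower neighbor  -> 3:4
send_hi = (n_dim - (ol - 1)):(n_dim - (ol - 1) + hw - 1)  # planes sent towards the upper neighbor  -> 5:6
recv_lo = 1:hw                                            # halo received from the lower neighbor   -> 1:2
recv_hi = (n_dim - (hw - 1)):n_dim                        # halo received from the upper neighbor   -> 7:8
# With an overlap of 4, local index 5 coincides with index 1 of the upper neighbor, so the
# planes in send_hi land exactly in that neighbor's recv_lo (and correspondingly for send_lo).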
-function sendranges(n::Integer, dim::Integer, A::GGArray) - if (ol(dim, A) < 2) error("Incoherent arguments: ol(A,dim)<2."); end +function sendranges(n::Integer, dim::Integer, F::GGField) + A, halowidths = F; + if (ol(dim, A) < 2*halowidths[dim]) error("Incoherent arguments: ol(A,dim)<2*halowidths[dim]."); end if (n==2) ixyz_dim = size(A, dim) - (ol(dim, A) - 1); - elseif (n==1) ixyz_dim = 1 + (ol(dim, A) - 1); + elseif (n==1) ixyz_dim = 1 + (ol(dim, A) - halowidths[dim]); end sendranges = [1:size(A,1), 1:size(A,2), 1:size(A,3)]; # Initialize with the ranges of A. - sendranges[dim] = ixyz_dim:ixyz_dim; + sendranges[dim] = ixyz_dim:ixyz_dim+halowidths[dim]-1; return sendranges end # Return the ranges from A to be received. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). -function recvranges(n::Integer, dim::Integer, A::GGArray) - if (ol(dim, A) < 2) error("Incoherent arguments: ol(A,dim)<2."); end - if (n==2) ixyz_dim = size(A, dim); +function recvranges(n::Integer, dim::Integer, F::GGField) + A, halowidths = F; + if (ol(dim, A) < 2*halowidths[dim]) error("Incoherent arguments: ol(A,dim)<2*halowidths[dim]."); end + if (n==2) ixyz_dim = size(A, dim) - (halowidths[dim] - 1); elseif (n==1) ixyz_dim = 1; end recvranges = [1:size(A,1), 1:size(A,2), 1:size(A,3)]; # Initialize with the ranges of A. - recvranges[dim] = ixyz_dim:ixyz_dim; + recvranges[dim] = ixyz_dim:ixyz_dim+halowidths[dim]-1; return recvranges end @@ -611,10 +613,7 @@ function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrange iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - if (dim == 1) gpusendbuf[iy,iz] = A[ix,iy,iz]; - elseif (dim == 2) gpusendbuf[ix,iz] = A[ix,iy,iz]; - elseif (dim == 3) gpusendbuf[ix,iy] = A[ix,iy,iz]; - end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; return nothing end @@ -624,10 +623,7 @@ function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - if (dim == 1) A[ix,iy,iz] = gpurecvbuf[iy,iz]; - elseif (dim == 2) A[ix,iy,iz] = gpurecvbuf[ix,iz]; - elseif (dim == 3) A[ix,iy,iz] = gpurecvbuf[ix,iy]; - end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; return nothing end @@ -664,10 +660,7 @@ function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendran iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - if (dim == 1) gpusendbuf[iy,iz] = A[ix,iy,iz]; - elseif (dim == 2) gpusendbuf[ix,iz] = A[ix,iy,iz]; - elseif (dim == 3) gpusendbuf[ix,iy] = A[ix,iy,iz]; - end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; return nothing end @@ -677,10 +670,7 @@ function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, 
A::ROCDeviceArray{T}, recvrang iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - if (dim == 1) A[ix,iy,iz] = gpurecvbuf[iy,iz]; - elseif (dim == 2) A[ix,iy,iz] = gpurecvbuf[ix,iz]; - elseif (dim == 3) A[ix,iy,iz] = gpurecvbuf[ix,iy]; - end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; return nothing end @@ -719,12 +709,12 @@ end function irecv_halo!(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integer=0) req = MPI.REQUEST_NULL; - A, halowidths = F.A, F.halowidths; + A, halowidths = F; if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) - req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); else - req = MPI.Irecv!(recvbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Irecv!(recvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); end end return req @@ -732,31 +722,31 @@ end function isend_halo(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integer=0) req = MPI.REQUEST_NULL; - A, halowidths = F.A, F.halowidths; + A, halowidths = F; if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) - req = MPI.Isend(gpusendbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Isend(gpusendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); else - req = MPI.Isend(sendbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Isend(sendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); end end return req end function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer) - A, halowidths = F.A, F.halowidths; + A, halowidths = F; if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) if n == 1 - gpumemcopy!(gpurecvbuf_flat(2,dim,i,A), gpusendbuf_flat(1,dim,i,A)); + gpumemcopy!(gpurecvbuf_flat(2,dim,i,F), gpusendbuf_flat(1,dim,i,F)); elseif n == 2 - gpumemcopy!(gpurecvbuf_flat(1,dim,i,A), gpusendbuf_flat(2,dim,i,A)); + gpumemcopy!(gpurecvbuf_flat(1,dim,i,F), gpusendbuf_flat(2,dim,i,F)); end else if n == 1 - memcopy!(recvbuf_flat(2,dim,i,A), sendbuf_flat(1,dim,i,A), loopvectorization(dim)); + memcopy!(recvbuf_flat(2,dim,i,F), sendbuf_flat(1,dim,i,F), loopvectorization(dim)); elseif n == 2 - memcopy!(recvbuf_flat(1,dim,i,A), sendbuf_flat(2,dim,i,A), loopvectorization(dim)); + memcopy!(recvbuf_flat(1,dim,i,F), sendbuf_flat(2,dim,i,F), loopvectorization(dim)); end end end @@ -812,10 +802,12 @@ end ## FUNCTIONS FOR WRAPPING AND CHECKING THE INPUT ARGUMENTS -# Wrap the input argument into a GGField. +# Wrap the input argument into a GGField/CPUField/CuField/ROCField. 
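# A hypothetical user-side sketch of what this wrapping enables (array sizes and keyword
# values are made up): plain arrays keep the grid-wide default halo width, while the
# named-tuple form overrides it per array in `update_halo!`.
using ImplicitGlobalGrid
init_global_grid(64, 64, 64; overlaps=(4,4,4), halowidths=(2,2,2))
T = zeros(64, 64, 64);
C = zeros(64, 64, 64);
update_halo!(T)                                # uses the default halo width (2,2,2)
update_halo!((A=C, halowidths=(1,1,1)), T)     # per-array halo width override for C
finalize_global_grid()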
wrap_field(A::GGField) = A wrap_field(A::GGFieldConvertible) = GGField(A) -wrap_field(A::GGArray, hw::Tuple) = GGField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) wrap_field(A::GGArray) = wrap_field(A, hw_default()...) From dc61f9208b6df1bac8b5a55511028a6430efb36c Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 5 Oct 2023 18:12:01 +0200 Subject: [PATCH 23/37] generalize halo update unit tests for fields --- test/test_update_halo.jl | 166 ++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 79 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index 22118d5..e7e198a 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -87,10 +87,11 @@ dz = 1.0 C = zeros(Float32, nx+1, ny+1, nz+1); Z = zeros(ComplexF16, nx, ny, nz ); Y = zeros(ComplexF16, nx-1, ny+2, nz+1); + P, A, B, C, Z, Y = GG.wrap_field.((P, A, B, C, Z, Y)); @testset "free buffers" begin @require GG.get_sendbufs_raw() === nothing @require GG.get_recvbufs_raw() === nothing - GG.allocate_bufs(GG.wrap_field(P)); + GG.allocate_bufs(P); @require GG.get_sendbufs_raw() !== nothing @require GG.get_recvbufs_raw() !== nothing GG.free_update_halo_buffers(); @@ -99,7 +100,7 @@ dz = 1.0 end; @testset "allocate single" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field(P)); + GG.allocate_bufs(P); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 1 # 1 array @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -110,7 +111,7 @@ dz = 1.0 end; @testset "allocate single (Complex)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field(Z)); + GG.allocate_bufs(Z); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 1 # 1 array @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -121,8 +122,8 @@ dz = 1.0 end; @testset "keep 1st, allocate 2nd" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field(P)); - GG.allocate_bufs(GG.wrap_field.((A, P))...); + GG.allocate_bufs(P); + GG.allocate_bufs(A, P); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # 2 arrays @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -135,8 +136,8 @@ dz = 1.0 end; @testset "keep 1st, allocate 2nd (Complex)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field(Z)); - GG.allocate_bufs(GG.wrap_field.((Y, Z))...); + GG.allocate_bufs(Z); + GG.allocate_bufs(Y, Z); for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # 2 arrays @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -149,8 +150,8 @@ dz = 1.0 end; @testset "reinterpret (no allocation)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field.((A, P))...); - GG.allocate_bufs(GG.wrap_field.((B, C))...); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory + GG.allocate_bufs(A, P); + GG.allocate_bufs(B, C); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory for 
bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # Still 2 arrays: B, C (even though they are different then before: was A and P) @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -164,8 +165,8 @@ dz = 1.0 end; @testset "reinterpret (no allocation) (Complex)" begin GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field.((A, P))...); - GG.allocate_bufs(GG.wrap_field.((Y, Z))...); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory + GG.allocate_bufs(A, P); + GG.allocate_bufs(Y, Z); # The new arrays contain Float32 (A, and P were Float64); B and C have a halo with more elements than A and P had, but they require less space in memory for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] @test length(bufs_raw) == 2 # Still 2 arrays: B, C (even though they are different then before: was A and P) @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension @@ -183,14 +184,14 @@ dz = 1.0 sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); end GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field.((A, P))...); + GG.allocate_bufs(A, P); for dim = 1:ndims(A), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,1,A)) .== size(A)[1:ndims(A).!=dim]) - @test all(size(recvbuf(n,dim,1,A)) .== size(A)[1:ndims(A).!=dim]) + @test all(length(sendbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) + @test all(length(recvbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) end for dim = 1:ndims(P), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,2,P)) .== size(P)[1:ndims(P).!=dim]) - @test all(size(recvbuf(n,dim,2,P)) .== size(P)[1:ndims(P).!=dim]) + @test all(length(sendbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) + @test all(length(recvbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) end end; @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (Complex)" begin @@ -199,14 +200,14 @@ dz = 1.0 sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); end GG.free_update_halo_buffers(); - GG.allocate_bufs(GG.wrap_field.((Y, Z))...); + GG.allocate_bufs(Y, Z); for dim = 1:ndims(Y), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim]) - @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim]) + @test all(length(sendbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) + @test all(length(recvbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) end for dim = 1:ndims(Z), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim]) - @test all(size(recvbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim]) + @test all(length(sendbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) + @test all(length(recvbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) end end; finalize_global_grid(finalize_MPI=false); @@ -218,6 +219,7 @@ dz = 1.0 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); + P, A = GG.wrap_field.((P, A)); @test GG.sendranges(1, 1, P) == [ 2:2, 1:size(P,2), 1:size(P,3)] @test GG.sendranges(2, 1, P) == [size(P,1)-1:size(P,1)-1, 1:size(P,2), 1:size(P,3)] @test GG.sendranges(1, 2, P) == [ 1:size(P,1), 2:2, 1:size(P,3)] @@ -249,22 +251,23 @@ dz = 1.0 P = zeros(nx, ny, nz ); P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P2 = zeros(size(P)); + halowidths = (1,1,1) # 
(dim=1) - buf = zeros(size(P,2), size(P,3)); + buf = zeros(halowidths[1], size(P,2), size(P,3)); ranges = [2:2, 1:size(P,2), 1:size(P,3)]; GG.write_h2h!(buf, P, ranges, 1); @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) GG.read_h2h!(buf, P2, ranges, 1); @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) # (dim=2) - buf = zeros(size(P,1), size(P,3)); + buf = zeros(size(P,1), halowidths[2], size(P,3)); ranges = [1:size(P,1), 3:3, 1:size(P,3)]; GG.write_h2h!(buf, P, ranges, 2); @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) GG.read_h2h!(buf, P2, ranges, 2); @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) # (dim=3) - buf = zeros(size(P,1), size(P,2)); + buf = zeros(size(P,1), size(P,2), halowidths[3]); ranges = [1:size(P,1), 1:size(P,2), 4:4]; GG.write_h2h!(buf, P, ranges, 3); @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) @@ -278,11 +281,12 @@ dz = 1.0 P = zeros(nx, ny, nz ); P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P = GPUArray(P); + halowidths = (1,1,1) if array_type == "CUDA" # (dim=1) dim = 1; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,2), size(P,3)); + buf = zeros(halowidths[dim], size(P,2), size(P,3)); buf_d, buf_h = GG.register(CuArray,buf); ranges = [2:2, 1:size(P,2), 1:size(P,3)]; nthreads = (1, 1, 1); @@ -303,7 +307,7 @@ dz = 1.0 # (dim=2) dim = 2; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,3)); + buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d, buf_h = GG.register(CuArray,buf); ranges = [1:size(P,1), 3:3, 1:size(P,3)]; nthreads = (1, 1, 1); @@ -324,7 +328,7 @@ dz = 1.0 # (dim=3) dim = 3 P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,2)); + buf = zeros(size(P,1), size(P,2), halowidths[dim]); buf_d, buf_h = GG.register(CuArray,buf); ranges = [1:size(P,1), 1:size(P,2), 4:4]; nthreads = (1, 1, 1); @@ -346,7 +350,7 @@ dz = 1.0 # (dim=1) dim = 1; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,2), size(P,3)); + buf = zeros(halowidths[dim], size(P,2), size(P,3)); buf_d = GG.register(ROCArray,buf); ranges = [2:2, 1:size(P,2), 1:size(P,3)]; nthreads = (1, 1, 1); @@ -367,7 +371,7 @@ dz = 1.0 # (dim=2) dim = 2; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,3)); + buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d = GG.register(ROCArray,buf); ranges = [1:size(P,1), 3:3, 1:size(P,3)]; nthreads = (1, 1, 1); @@ -388,7 +392,7 @@ dz = 1.0 # (dim=3) dim = 3 P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,2)); + buf = zeros(size(P,1), size(P,2), halowidths[dim]); buf_d = GG.register(ROCArray,buf); ranges = [1:size(P,1), 1:size(P,2), 4:4]; nthreads = (1, 1, 1); @@ -396,7 +400,7 @@ dz = 1.0 nblocks = Tuple(ceil.(Int, halosize./nthreads)); @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize(); @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:])) - @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize(); + @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize(); @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:])) # buf .= 0.0; # DEBUG: diabling read_x2x_async! 
tests for now in AMDGPU backend because there is an issue most likely in HIP # P2 .= 0.0; @@ -416,7 +420,8 @@ dz = 1.0 A = zeros(nx-1,ny+2,nz+1); P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]); A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]); - GG.allocate_bufs(GG.wrap_field.((P, A))...); + P, A = GG.wrap_field.((P, A)); + GG.allocate_bufs(P, A); if (array_type == "CUDA") GG.allocate_custreams(P, A); elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A); else GG.allocate_tasks(P, A); @@ -428,10 +433,10 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[2,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[2,:,:][:])) @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0) end n = 2 @@ -440,10 +445,10 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[end-1,:,:][:]))) @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-1,:,:][:])) @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0) end dim = 2 @@ -453,11 +458,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,2,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,4,:][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,2,:][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,4,:][:])) end n = 2 GG.iwrite_sendbufs!(n, dim, P, 1); @@ -465,11 +470,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,end-1,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,end-3,:][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,end-1,:][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== 
CPUArray(A.A[:,end-3,:][:])) end dim = 3 n = 1 @@ -478,11 +483,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,:,3][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,:,4][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,3][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,4][:])) end n = 2 GG.iwrite_sendbufs!(n, dim, P, 1); @@ -490,11 +495,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,:,end-2][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,:,end-3][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,end-2][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,end-3][:])) end finalize_global_grid(finalize_MPI=false); end; @@ -502,7 +507,8 @@ dz = 1.0 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); - GG.allocate_bufs(GG.wrap_field.((P, A))...); + P, A = GG.wrap_field.((P, A)); + GG.allocate_bufs(P, A); if (array_type == "CUDA") GG.allocate_custreams(P, A); elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A); else GG.allocate_tasks(P, A); @@ -523,11 +529,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))) - @test all(CPUArray( 0.0 .== Array(A[1,:,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[1,:,:][:]))) + @test all(CPUArray( 0.0 .== Array(A.A[1,:,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:])) - @test all( 0.0 .== CPUArray(A[1,:,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[1,:,:][:])) + @test all( 0.0 .== CPUArray(A.A[1,:,:][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -535,11 +541,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))) - @test all(CPUArray( 0.0 .== Array(A[end,:,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[end,:,:][:]))) + @test all(CPUArray( 0.0 .== Array(A.A[end,:,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:])) - @test all( 0.0 .== CPUArray(A[end,:,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[end,:,:][:])) + @test all( 0.0 .== 
CPUArray(A.A[end,:,:][:])) end dim = 2 for n = 1:nneighbors_per_dim @@ -557,11 +563,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,1,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,1,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,1,:][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,1,:][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -569,11 +575,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,end,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,end,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,end,:][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,end,:][:])) end dim = 3 for n = 1:nneighbors_per_dim @@ -591,11 +597,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,:,1][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,:,1][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,1][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,1][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -603,11 +609,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,:,end][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,:,end][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,end][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,end][:])) end finalize_global_grid(finalize_MPI=false); end; @@ -616,7 +622,8 @@ dz = 1.0 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); - 
GG.allocate_bufs(GG.wrap_field.((P, A))...); + P, A = GG.wrap_field.((P, A)); + GG.allocate_bufs(P, A); dim = 1 for n = 1:nneighbors_per_dim if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) @@ -701,8 +708,9 @@ dz = 1.0 me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx,ny,nz); A = zeros(nx,ny,nz); + P, A = GG.wrap_field.((P, A)); dim = 1; - GG.allocate_bufs(GG.wrap_field.((P, A))...); + GG.allocate_bufs(P, A); for n = 1:nneighbors_per_dim if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) GG.gpusendbuf(n,dim,1,P) .= 9.0; From c4c2d0177383e5b06d9cea680edc51430d599501 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 11:19:45 +0200 Subject: [PATCH 24/37] adjust halowidths default and parameter error messages --- src/init_global_grid.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index 102d3f4..de47075 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -13,7 +13,7 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de - `quiet::Bool=false`: whether to suppress printing information like the size of the global grid (`true`) or not (`false`). !!! note "Advanced keyword arguments" - `overlaps::Tuple{Int,Int,Int}=(2,2,2)`: the number of elements adjacent local grids overlap in dimension x, y and z. By default (value `(2,2,2)`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlaps[1]=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z. - - `halowidths::Tuple{Int,Int,Int}=overlaps.÷2`: the default width of an array's halo in dimension x, y and z. The default can be overwritten per array in the function [`update_halo`](@ref). + - `halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2)`: the default width of an array's halo in dimension x, y and z (must be at least 1). The default can be overwritten per array in the function [`update_halo`](@ref). - `disp::Integer=1`: the displacement argument to `MPI.Cart_shift` in order to determine the neighbors. - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology. 
@@ -38,7 +38,7 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de See also: [`finalize_global_grid`](@ref), [`select_device`](@ref) """ -function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=overlaps.÷2, disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) +function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) if grid_is_initialized() error("The global grid has already been initialized.") end nxyz = [nx, ny, nz]; dims = [dimx, dimy, dimz]; @@ -72,11 +72,15 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end + if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end + if (any(periods .∉ ((0,1),))) error("Invalid arguments: periodx, periody, and periodz must be either 0 or 1."); end + if (any(halowidths .< 1)) error("Invalid arguments: halowidths cannot be less than 1."); end if (nx==1) error("Invalid arguments: nx can never be 1.") end if (ny==1 && nz>1) error("Invalid arguments: ny cannot be 1 if nz is greater than 1.") end if (any((nxyz .== 1) .& (dims .>1 ))) error("Incoherent arguments: if nx, ny, or nz is 1, then the corresponding dimx, dimy or dimz must not be set (or set 0 or 1)."); end if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlaps[1]-1, 2*overlaps[2]-1 or 2*overlaps[3]-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end - if (any(halowidths .> overlaps.÷2)) error("Incoherent arguments: halowidths cannot be greater than overlaps in each dimension."); end + if (any((overlaps .> 0) .& (halowidths .> overlaps.÷2))) error("Incoherent arguments: if overlap is greater than 0, then halowidth cannot be greater than overlap÷2, in each dimension."); end dims[(nxyz.==1).&(dims.==0)] .= 1; # Setting any of nxyz to 1, means that the corresponding dimension must also be 1 in the global grid. Thus, the corresponding dims entry must be 1. if (init_MPI) # NOTE: init MPI only, once the input arguments have been checked. if (MPI.Initialized()) error("MPI is already initialized. 
Set the argument 'init_MPI=false'."); end From 006e9aec92b3ea34b49edf02837593adf0fef498 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 11:23:48 +0200 Subject: [PATCH 25/37] add documentation for non default halowidths and force value greater than one --- src/update_halo.jl | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index e8936d1..93dca30 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -4,11 +4,15 @@ export update_halo! update_halo!(A) update_halo!(A...) +!!! note "Advanced" + update_halo!(A, B, (A=C, halowidths=...), (A=D, halowidths=...), ...) + Update the halo of the given GPU/CPU-array(s). # Typical use cases: - update_halo!(A) # Update the halo of the array A. - update_halo!(A, B, C) # Update the halos of the arrays A, B and C. + update_halo!(A) # Update the halo of the array A. + update_halo!(A, B, C) # Update the halos of the arrays A, B and C. + update_halo!(A, B, (A=C, halowidths=(2,2,2))) # Update the halos of the arrays A, B, C, defining non default halowidth for C. !!! note "Performance note" Group subsequent calls to `update_halo!` in a single call for better performance (enables additional pipelining). @@ -814,11 +818,19 @@ wrap_field(A::GGArray) = wrap_field(A, hw_default()...) # NOTE: no comparison must be done between the field-local halowidths and field-local overlaps because any combination is valid: the rational is that a field has simply no halo but only computation overlap in a given dimension if the corresponding local overlap is less than 2 times the local halowidth. This allows to determine whether a halo update needs to be done in a certain dimension or not. function check_fields(fields::GGField...) + # Raise an error if any of the given fields has a halowidth less than 1. + invalid_halowidths = [i for i=1:length(fields) if any([fields[i].halowidths[dim]<1 for dim=1:ndims(fields[i])])]; + if length(invalid_halowidths) > 1 + error("The fields at positions $(join(invalid_halowidths,", "," and ")) have a halowidth less than 1.") + elseif length(invalid_halowidths) > 0 + error("The field at position $(invalid_halowidths[1]) has a halowidth less than 1.") + end + # Raise an error if any of the given fields has no halo at all (in any dimension) - in this case there is no halo update to do and including the field in the call is inconsistent. no_halo = Int[]; for i = 1:length(fields) A, halowidths = fields[i] - if all([halowidths[dim]==0 || (ol(dim, A) < 2*halowidths[dim]) for dim = 1:ndims(A)]) # There is no halo if the overlap is less than 2 times the halowidth (only computation overlap in this case)... + if all([(ol(dim, A) < 2*halowidths[dim]) for dim = 1:ndims(A)]) # There is no halo if the overlap is less than 2 times the halowidth (only computation overlap in this case)... push!(no_halo, i); end end From 49057484c0a0dddfb9f753e7c644d09eb1bd842d Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 11:26:32 +0200 Subject: [PATCH 26/37] add unit tests for default halowidths computation and forcing value greater than 1 --- test/test_init_global_grid.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl index ba975fb..9076a94 100644 --- a/test/test_init_global_grid.jl +++ b/test/test_init_global_grid.jl @@ -74,16 +74,19 @@ nz = 1; end; @testset "5. 
initialization with non-default overlaps and one periodic boundary" begin - nz = 8; - olz = 3; + nz = 10; olx = 3; - init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlaps=(olx, 2, olz), quiet=true, init_MPI=false); + oly = 0; + olz = 4; + init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlaps=(olx, oly, olz), quiet=true, init_MPI=false); @testset "initialized" begin @test GG.grid_is_initialized() end @testset "values in global grid" begin # (Checks only what is different than in the basic test.) @test GG.global_grid().nxyz_g == [nx, ny, nz-olz] # Note: olx has no effect as there is only 1 process and this boundary is not periodic. @test GG.global_grid().nxyz == [nx, ny, nz ] + @test GG.global_grid().overlaps == [olx, oly, olz] + @test GG.global_grid().halowidths== [1, 1, 2] @test GG.global_grid().neighbors == [p0 p0 0; p0 p0 0] @test GG.global_grid().periods == [0, 0, 1] end; @@ -101,6 +104,7 @@ nz = 1; @test_throws ErrorException init_global_grid(nx, ny, 1, dimz=3, quiet=true, init_MPI=false); # Error: dimz>1 while nz==1. @test_throws ErrorException init_global_grid(nx, ny, 1, periodz=1, quiet=true, init_MPI=false); # Error: periodz==1 while nz==1. @test_throws ErrorException init_global_grid(nx, ny, nz, periody=1, overlaps=(2,3,2), quiet=true, init_MPI=false); # Error: periody==1 while ny<2*overlaps[2]-1 (4<5). + @test_throws ErrorException init_global_grid(nx, ny, nz, halowidths=(1,0,1), quiet=true, init_MPI=false); # Error: halowidths[2]<1. @test_throws ErrorException init_global_grid(nx, ny, nz, overlaps=(4,3,2), halowidths=(2,2,1), quiet=true, init_MPI=false); # Error: halowidths[2]==2 while overlaps[2]==3. @test_throws ErrorException init_global_grid(nx, ny, nz, quiet=true); # Error: MPI already initialized @testset "already initialized exception" begin From 57b8dbaf614f6349b2c6db5398183d51afa9ae8d Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 11:27:45 +0200 Subject: [PATCH 27/37] add unit tests for forcing value greater than one in update_halo! --- test/test_update_halo.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index e7e198a..0a1d54e 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -68,6 +68,7 @@ dz = 1.0 Z2 = Z; @test_throws ErrorException update_halo!(P, Sxz, A) # Error: Sxz has no halo. @test_throws ErrorException update_halo!(P, Sxz, A, Sxz) # Error: Sxz and Sxz have no halo. + @test_throws ErrorException update_halo!(A, (A=P, halowidths=(1,0,1))) # Error: P has an invalid halowidth (less than 1). @test_throws ErrorException update_halo!(A, (A=P, halowidths=(2,2,2))) # Error: P has no halo. @test_throws ErrorException update_halo!((A=A, halowidths=(0,3,2)), (A=P, halowidths=(2,2,2))) # Error: A and P have no halo. @test_throws ErrorException update_halo!(P, A, A) # Error: A is given twice. 
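Taken together, the patches above introduce the `halowidths` interface end to end: the keyword argument and its validation in `init_global_grid`, the per-array override in `update_halo!`, and the corresponding unit tests. A minimal usage sketch of that interface follows; the array names, sizes and the chosen overlap/halowidth values are illustrative only and are not part of the patches:

    using ImplicitGlobalGrid
    nx, ny, nz = 64, 64, 64
    # Overlaps larger than 2 allow halo widths greater than 1
    # (halowidths defaults to max.(1, overlaps.÷2) and must be at least 1).
    me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; overlaps=(4,2,2), halowidths=(2,1,1))
    A = zeros(nx, ny, nz)
    B = zeros(nx, ny, nz)
    C = zeros(nx, ny, nz)
    update_halo!(A, B)                          # uses the default halowidths set above
    update_halo!(A, (A=C, halowidths=(1,1,1)))  # per-array halo width override, as documented in patch 25
    finalize_global_grid()

When launched on several MPI ranks (e.g. via mpiexec), each rank holds its local nx*ny*nz block and `update_halo!` exchanges halos of the configured widths with its Cartesian neighbors.
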
From 6e00b3c1f0c2b864547cc77f33ef556b08a2cc7c Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 11:47:14 +0200 Subject: [PATCH 28/37] move field related functions to shared --- src/shared.jl | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/shared.jl b/src/shared.jl index f2d2b3d..4a75457 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -48,13 +48,6 @@ const CPUField{T,N} = GGField{T,N,Array{T,N}} const CuField{T,N} = GGField{T,N,CuArray{T,N}} const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} -Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A) -Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...) -Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A) -Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A) -Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A) - - "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid nxyz_g::Vector{GGInt} @@ -127,6 +120,24 @@ is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this funct is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + +wrap_field(A::GGField) = A +wrap_field(A::GGFieldConvertible) = GGField(A) +wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) +wrap_field(A::GGArray) = wrap_field(A, hw_default()...) + +Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A) +Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...) +Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A) +Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A) +Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A) + + ##--------------- ## CUDA functions From 26d84e53cda260888c3fdc9643d8c34abcd42e08 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 11:47:38 +0200 Subject: [PATCH 29/37] move field related functions to shared --- src/update_halo.jl | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index 93dca30..83a0f0a 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -802,19 +802,8 @@ function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber end -##-------------------------------------------------------- -## FUNCTIONS FOR WRAPPING AND CHECKING THE INPUT ARGUMENTS - - -# Wrap the input argument into a GGField/CPUField/CuField/ROCField. 
-wrap_field(A::GGField) = A -wrap_field(A::GGFieldConvertible) = GGField(A) -wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) -wrap_field(A::GGArray) = wrap_field(A, hw_default()...) - +##------------------------------------------- +## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS # NOTE: no comparison must be done between the field-local halowidths and field-local overlaps because any combination is valid: the rational is that a field has simply no halo but only computation overlap in a given dimension if the corresponding local overlap is less than 2 times the local halowidth. This allows to determine whether a halo update needs to be done in a certain dimension or not. function check_fields(fields::GGField...) From 7a22ee9f286b4074f6d08ea3aa884d3ebe55ff92 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 16:54:11 +0200 Subject: [PATCH 30/37] fix max_halo_elems --- src/update_halo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/update_halo.jl b/src/update_halo.jl index 83a0f0a..189fe9c 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -169,7 +169,7 @@ let if cuda_enabled() reinterpret_cubufs(T, i, n); end if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end end - max_halo_elems = (ndims(A) > 1) ? maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])) : halowidths[1]; + max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); if (length(sendbufs_raw[i][1]) < max_halo_elems) for n = 1:NNEIGHBORS_PER_DIM reallocate_bufs(T, i, n, max_halo_elems); From c4e979470ac990cb52b9119208d89be436bbc6aa Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 16:57:35 +0200 Subject: [PATCH 31/37] add unit tests for buffer allocation and send receive ranges for halowidth>1 --- test/test_update_halo.jl | 100 +++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index 0a1d54e..9e2be5b 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -89,6 +89,8 @@ dz = 1.0 Z = zeros(ComplexF16, nx, ny, nz ); Y = zeros(ComplexF16, nx-1, ny+2, nz+1); P, A, B, C, Z, Y = GG.wrap_field.((P, A, B, C, Z, Y)); + halowidths = (3,1,2); + A_hw, Z_hw = GG.wrap_field(A.A, halowidths), GG.wrap_field(Z.A, halowidths); @testset "free buffers" begin @require GG.get_sendbufs_raw() === nothing @require GG.get_recvbufs_raw() === nothing @@ -121,6 +123,18 @@ dz = 1.0 end end end; + @testset "allocate single (halowidth > 1)" begin + GG.free_update_halo_buffers(); + GG.allocate_bufs(A_hw); + max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); + for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] + @test length(bufs_raw) == 1 # 1 array + @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension + for n = 1:nneighbors_per_dim + @test length(bufs_raw[1][n]) >= max_halo_elems # required length: max halo elements in any of the dimensions + end + end + end; @testset "keep 1st, allocate 2nd" begin GG.free_update_halo_buffers(); GG.allocate_bufs(P); @@ -187,12 +201,16 @@ dz = 1.0 
GG.free_update_halo_buffers(); GG.allocate_bufs(A, P); for dim = 1:ndims(A), n = 1:nneighbors_per_dim - @test all(length(sendbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) - @test all(length(recvbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) + @test all(length(sendbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) + @test all(length(recvbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) + @test all(size(sendbuf(n,dim,1,A))[dim] .== A.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,A))[dim] .== A.halowidths[dim]) end for dim = 1:ndims(P), n = 1:nneighbors_per_dim - @test all(length(sendbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) - @test all(length(recvbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) + @test all(length(sendbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) + @test all(length(recvbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) + @test all(size(sendbuf(n,dim,2,P))[dim] .== P.halowidths[dim]) + @test all(size(recvbuf(n,dim,2,P))[dim] .== P.halowidths[dim]) end end; @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (Complex)" begin @@ -203,12 +221,44 @@ dz = 1.0 GG.free_update_halo_buffers(); GG.allocate_bufs(Y, Z); for dim = 1:ndims(Y), n = 1:nneighbors_per_dim - @test all(length(sendbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) - @test all(length(recvbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) + @test all(length(sendbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) + @test all(length(recvbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) + @test all(size(sendbuf(n,dim,1,Y))[dim] .== Y.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,Y))[dim] .== Y.halowidths[dim]) end for dim = 1:ndims(Z), n = 1:nneighbors_per_dim - @test all(length(sendbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) - @test all(length(recvbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) + @test all(length(sendbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) + @test all(length(recvbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) + @test all(size(sendbuf(n,dim,2,Z))[dim] .== Z.halowidths[dim]) + @test all(size(recvbuf(n,dim,2,Z))[dim] .== Z.halowidths[dim]) + end + end; + @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (halowidth > 1)" begin + sendbuf, recvbuf = (GG.sendbuf, GG.recvbuf); + if array_type in ["CUDA", "AMDGPU"] + sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); + end + GG.free_update_halo_buffers(); + GG.allocate_bufs(A_hw); + for dim = 1:ndims(A_hw), n = 1:nneighbors_per_dim + @test all(length(sendbuf(n,dim,1,A_hw)) .== prod(size(A_hw)[1:ndims(A_hw).!=dim])*A_hw.halowidths[dim]) + @test all(length(recvbuf(n,dim,1,A_hw)) .== prod(size(A_hw)[1:ndims(A_hw).!=dim])*A_hw.halowidths[dim]) + @test all(size(sendbuf(n,dim,1,A_hw))[dim] .== A_hw.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,A_hw))[dim] .== A_hw.halowidths[dim]) + end + end; + @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (halowidth > 1, Complex)" begin + sendbuf, recvbuf = (GG.sendbuf, GG.recvbuf); + if array_type in ["CUDA", "AMDGPU"] + sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); + end + GG.free_update_halo_buffers(); + GG.allocate_bufs(Z_hw); + for dim = 1:ndims(Z_hw), n = 1:nneighbors_per_dim + @test all(length(sendbuf(n,dim,1,Z_hw)) .== prod(size(Z_hw)[1:ndims(Z_hw).!=dim])*Z_hw.halowidths[dim]) + @test all(length(recvbuf(n,dim,1,Z_hw)) .== prod(size(Z_hw)[1:ndims(Z_hw).!=dim])*Z_hw.halowidths[dim]) + @test all(size(sendbuf(n,dim,1,Z_hw))[dim] .== Z_hw.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,Z_hw))[dim] .== Z_hw.halowidths[dim]) end end; 
finalize_global_grid(finalize_MPI=false); @@ -247,6 +297,40 @@ dz = 1.0 @test GG.recvranges(2, 3, A) == [ 1:size(A,1), 1:size(A,2), size(A,3):size(A,3)] finalize_global_grid(finalize_MPI=false); end; + @testset "sendranges / recvranges (halowidth > 1, $array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) + nx = 13; + ny = 9; + nz = 9; + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(6,4,4), halowidths=(3,1,2), quiet=true, init_MPI=false, device_type=device_type); + P = zeros(nx, ny, nz ); + A = zeros(nx-1,ny+2,nz+1); + P, A = GG.wrap_field.((P, A)); + @test GG.sendranges(1, 1, P) == [ 4:6, 1:size(P,2), 1:size(P,3)] + @test GG.sendranges(2, 1, P) == [size(P,1)-5:size(P,1)-3, 1:size(P,2), 1:size(P,3)] + @test GG.sendranges(1, 2, P) == [ 1:size(P,1), 4:4, 1:size(P,3)] + @test GG.sendranges(2, 2, P) == [ 1:size(P,1), size(P,2)-3:size(P,2)-3, 1:size(P,3)] + @test GG.sendranges(1, 3, P) == [ 1:size(P,1), 1:size(P,2), 3:4] + @test GG.sendranges(2, 3, P) == [ 1:size(P,1), 1:size(P,2), size(P,3)-3:size(P,3)-2] + @test GG.recvranges(1, 1, P) == [ 1:3, 1:size(P,2), 1:size(P,3)] + @test GG.recvranges(2, 1, P) == [ size(P,1)-2:size(P,1), 1:size(P,2), 1:size(P,3)] + @test GG.recvranges(1, 2, P) == [ 1:size(P,1), 1:1, 1:size(P,3)] + @test GG.recvranges(2, 2, P) == [ 1:size(P,1), size(P,2):size(P,2), 1:size(P,3)] + @test GG.recvranges(1, 3, P) == [ 1:size(P,1), 1:size(P,2), 1:2] + @test GG.recvranges(2, 3, P) == [ 1:size(P,1), 1:size(P,2), size(P,3)-1:size(P,3)] + @test_throws ErrorException GG.sendranges(1, 1, A) + @test_throws ErrorException GG.sendranges(2, 1, A) + @test GG.sendranges(1, 2, A) == [ 1:size(A,1), 6:6, 1:size(A,3)] + @test GG.sendranges(2, 2, A) == [ 1:size(A,1), size(A,2)-5:size(A,2)-5, 1:size(A,3)] + @test GG.sendranges(1, 3, A) == [ 1:size(A,1), 1:size(A,2), 4:5] + @test GG.sendranges(2, 3, A) == [ 1:size(A,1), 1:size(A,2), size(A,3)-4:size(A,3)-3] + @test_throws ErrorException GG.recvranges(1, 1, A) + @test_throws ErrorException GG.recvranges(2, 1, A) + @test GG.recvranges(1, 2, A) == [ 1:size(A,1), 1:1, 1:size(A,3)] + @test GG.recvranges(2, 2, A) == [ 1:size(A,1), size(A,2):size(A,2), 1:size(A,3)] + @test GG.recvranges(1, 3, A) == [ 1:size(A,1), 1:size(A,2), 1:2] + @test GG.recvranges(2, 3, A) == [ 1:size(A,1), 1:size(A,2), size(A,3)-1:size(A,3)] + finalize_global_grid(finalize_MPI=false); + end; @testset "write_h2h! / read_h2h!" begin init_global_grid(nx, ny, nz; quiet=true, init_MPI=false); P = zeros(nx, ny, nz ); From 10febcae73d2f8d5bd409cfad1b788d273dcba3a Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 17:18:41 +0200 Subject: [PATCH 32/37] add unit tests for read and write for halowidth>1 --- test/test_update_halo.jl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index 9e2be5b..b93bcf6 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -360,6 +360,35 @@ dz = 1.0 @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) finalize_global_grid(finalize_MPI=false); end; + @testset "write_h2h! / read_h2h! 
(halowidth > 1)" begin + init_global_grid(nx, ny, nz; quiet=true, init_MPI=false); + P = zeros(nx, ny, nz ); + P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; + P2 = zeros(size(P)); + halowidths = (3,1,2); + # (dim=1) + buf = zeros(halowidths[1], size(P,2), size(P,3)); + ranges = [4:6, 1:size(P,2), 1:size(P,3)]; + GG.write_h2h!(buf, P, ranges, 1); + @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) + GG.read_h2h!(buf, P2, ranges, 1); + @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) + # (dim=2) + buf = zeros(size(P,1), halowidths[2], size(P,3)); + ranges = [1:size(P,1), 4:4, 1:size(P,3)]; + GG.write_h2h!(buf, P, ranges, 2); + @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) + GG.read_h2h!(buf, P2, ranges, 2); + @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) + # (dim=3) + buf = zeros(size(P,1), size(P,2), halowidths[3]); + ranges = [1:size(P,1), 1:size(P,2), 3:4]; + GG.write_h2h!(buf, P, ranges, 3); + @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) + GG.read_h2h!(buf, P2, ranges, 3); + @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) + finalize_global_grid(finalize_MPI=false); + end; @static if test_cuda || test_amdgpu @testset "write_d2x! / write_d2h_async! / read_x2d! / read_h2d_async! ($array_type arrays)" for (array_type, device_type, gpuzeros, GPUArray) in zip(gpu_array_types, gpu_device_types, gpu_allocators, GPUArrayConstructors) init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type); From 11855468d75bbeef8fdc4bf8f8f34a67ae53ea7c Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 17:28:29 +0200 Subject: [PATCH 33/37] add unit tests for read and write for halowidth>1 --- test/test_update_halo.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index b93bcf6..fa95de2 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -395,7 +395,7 @@ dz = 1.0 P = zeros(nx, ny, nz ); P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P = GPUArray(P); - halowidths = (1,1,1) + halowidths = (1,3,1) if array_type == "CUDA" # (dim=1) dim = 1; @@ -423,7 +423,7 @@ dz = 1.0 P2 = gpuzeros(eltype(P),size(P)); buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d, buf_h = GG.register(CuArray,buf); - ranges = [1:size(P,1), 3:3, 1:size(P,3)]; + ranges = [1:size(P,1), 4:6, 1:size(P,3)]; nthreads = (1, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); @@ -487,7 +487,7 @@ dz = 1.0 P2 = gpuzeros(eltype(P),size(P)); buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d = GG.register(ROCArray,buf); - ranges = [1:size(P,1), 3:3, 1:size(P,3)]; + ranges = [1:size(P,1), 4:6, 1:size(P,3)]; nthreads = (1, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); From f8304e8a48113e4f3892e26a4cd944939b1f525e Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 6 Oct 2023 17:55:35 +0200 Subject: [PATCH 34/37] add unit tests for read and write for halowidth>1 --- test/test_update_halo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index fa95de2..e950af2 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -423,7 +423,7 @@ dz = 1.0 P2 = gpuzeros(eltype(P),size(P)); buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d, buf_h = GG.register(CuArray,buf); - ranges = 
[1:size(P,1), 4:6, 1:size(P,3)]; + ranges = [1:size(P,1), 2:4, 1:size(P,3)]; nthreads = (1, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); @@ -487,7 +487,7 @@ dz = 1.0 P2 = gpuzeros(eltype(P),size(P)); buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d = GG.register(ROCArray,buf); - ranges = [1:size(P,1), 4:6, 1:size(P,3)]; + ranges = [1:size(P,1), 2:4, 1:size(P,3)]; nthreads = (1, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); From 0ccd9f32162e3b14f3b7daae8ccce465a691ae66 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Mon, 9 Oct 2023 19:55:49 +0200 Subject: [PATCH 35/37] add halowidth>1 unit tests for high level read and write and update halo --- test/test_update_halo.jl | 77 ++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index e950af2..e8e7b44 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -529,7 +529,7 @@ dz = 1.0 end; end @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]); @@ -547,10 +547,10 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[2,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[3:4,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[2,:,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[3:4,:,:][:])) @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0) end n = 2 @@ -559,10 +559,10 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[end-1,:,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[end-2:end-3,:,:][:]))) @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-1,:,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-3:end-2,:,:][:])) @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0) end dim = 2 @@ -618,7 +618,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "iread_recvbufs! 
($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); P, A = GG.wrap_field.((P, A)); @@ -643,11 +643,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[1,:,:][:]))) - @test all(CPUArray( 0.0 .== Array(A.A[1,:,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[1:2,:,:][:]))) + @test all(CPUArray( 0.0 .== Array(A.A[1:2,:,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[1,:,:][:])) - @test all( 0.0 .== CPUArray(A.A[1,:,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[1:2,:,:][:])) + @test all( 0.0 .== CPUArray(A.A[1:2,:,:][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -655,11 +655,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[end,:,:][:]))) - @test all(CPUArray( 0.0 .== Array(A.A[end,:,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[end-1:end,:,:][:]))) + @test all(CPUArray( 0.0 .== Array(A.A[end-1:end,:,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[end,:,:][:])) - @test all( 0.0 .== CPUArray(A.A[end,:,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-1:end,:,:][:])) + @test all( 0.0 .== CPUArray(A.A[end-1:end,:,:][:])) end dim = 2 for n = 1:nneighbors_per_dim @@ -733,7 +733,7 @@ dz = 1.0 end; if (nprocs==1) @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); P, A = GG.wrap_field.((P, A)); @@ -819,7 +819,7 @@ dz = 1.0 end; if (nprocs>1) @testset "irecv_halo! 
/ isend_halo ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type); + me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, overlaps=(4,4,4), halowidths=(2,1,2), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx,ny,nz); A = zeros(nx,ny,nz); P, A = GG.wrap_field.((P, A)); @@ -912,14 +912,14 @@ dz = 1.0 @test all(CPUArray(P .== P_ref)) finalize_global_grid(finalize_MPI=false); end; - @testset "3D (non-default overlap)" begin - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), quiet=true, init_MPI=false, device_type=device_type); + @testset "3D (non-default overlap and halowidth)" begin + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz); P .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P_ref = copy(P); - P[[1, end], :, :] .= 0.0; - P[ :,[1, end], :] .= 0.0; - P[ :, :,[1, end]] .= 0.0; + P[[1,2, end-1,end], :, :] .= 0.0; + P[ :,[1, end], :] .= 0.0; + P[ :, :,[1, end]] .= 0.0; P = Array(P); P_ref = Array(P_ref); @require !all(CPUArray(P .== P_ref)) @@ -992,14 +992,14 @@ dz = 1.0 @test all(CPUArray(Vz .== Vz_ref)) finalize_global_grid(finalize_MPI=false); end; - @testset "3D (non-default overlap)" begin - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(3,2,3), quiet=true, init_MPI=false, device_type=device_type); + @testset "3D (non-default overlap and halowidth)" begin + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); Vx = zeros(nx+1,ny,nz); Vx .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)]; Vx_ref = copy(Vx); - Vx[[1, end], :, :] .= 0.0; - Vx[ :,[1, end], :] .= 0.0; - Vx[ :, :,[1, end]] .= 0.0; + Vx[[1,2, end-1,end], :, :] .= 0.0; + Vx[ :,[1, end], :] .= 0.0; + Vx[ :, :,[1, end]] .= 0.0; Vx = Array(Vx); Vx_ref = Array(Vx_ref); @require !all(CPUArray(Vx .== Vx_ref)) @@ -1175,6 +1175,31 @@ dz = 1.0 @test all(CPUArray(Vx .== Vx_ref)) finalize_global_grid(finalize_MPI=false); end; + @testset "3D (two fields simultaneously, non-default overlap and halowidth)" begin + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); + Vz = zeros(nx,ny,nz+1); + Vz .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)]; + Vz_ref = copy(Vz); + Vx = zeros(nx+1,ny,nz); + Vx .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)]; + Vx_ref = copy(Vx); + Vz[[1,2, end-1,end], :, :] .= 0.0; + Vz[ :,[1, end], :] .= 0.0; + Vz[ :, :,[1, end]] .= 0.0; + Vx[[1,2, end-1,end], :, :] .= 0.0; + Vx[ :,[1, end], :] .= 0.0; + Vx[ :, :,[1, end]] .= 0.0; + Vz = Array(Vz); + Vz_ref = Array(Vz_ref); + Vx = Array(Vx); + Vx_ref = Array(Vx_ref); + @require !all(CPUArray(Vz .== Vz_ref)) + @require !all(CPUArray(Vx .== Vx_ref)) + update_halo!(Vz, Vx); + @test all(CPUArray(Vz .== Vz_ref)) + @test all(CPUArray(Vx .== Vx_ref)) + finalize_global_grid(finalize_MPI=false); + end; end; 
end; end; From 5aa02b5b6045e14aa573f1a3e3487f3210aa4c6f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 12 Oct 2023 18:27:21 +0200 Subject: [PATCH 36/37] fixed unit test for GPU-aware --- test/test_update_halo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index e8e7b44..3a742fd 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -559,7 +559,7 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[end-2:end-3,:,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[end-3:end-2,:,:][:]))) @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-3:end-2,:,:][:])) From d6f0413621953c8338b3b096c5a944ceea1b7c1f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 1 Dec 2023 17:34:12 +0100 Subject: [PATCH 37/37] add MPIPreferences to test and support for CUDA 5 --- Project.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 495398f..58289eb 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,7 @@ version = "0.13.0" [compat] AMDGPU = "0.5" -CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4" +CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5" LoopVectorization = "0.12" MPI = "0.20" julia = "1.9" @@ -18,7 +18,8 @@ MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [extras] CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" +MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["Test", "MPIPreferences"]
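
For orientation, the index convention verified by the sendranges/recvranges unit tests in the patches above can be summarized as follows. This is an illustrative sketch for a single dimension of an array whose size matches the local grid (local size n, overlap ol, halo width hw; neighbor 1 is the lower side, neighbor 2 the upper side), not the package's internal implementation:

    # Send the hw cells at the inner edge of the overlap region; receive into the hw outermost (boundary) cells.
    send_range(neighbor, n, ol, hw) = (neighbor == 1) ? (ol-hw+1:ol) : (n-ol+1:n-ol+hw)
    recv_range(neighbor, n, hw)     = (neighbor == 1) ? (1:hw)       : (n-hw+1:n)

    send_range(1, 13, 6, 3)  # 4:6,  matching the x-dimension values tested with nx=13, overlaps=(6,4,4), halowidths=(3,1,2)
    recv_range(2, 13, 3)     # 11:13

Consistently, the buffer-allocation tests above require each send/receive buffer to hold hw times the slice size of the corresponding dimension.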