From 647a1cba9e6e19b3a95590c6cbc8960e1dea7519 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Thu, 29 Jun 2023 18:09:42 +0300
Subject: [PATCH 01/21] Update to support the HIP backend in AMDGPU 0.5.0

---
 Project.toml         |   6 +-
 src/select_device.jl |   4 +-
 src/shared.jl        |   2 +-
 src/update_halo.jl   | 144 +++++++++++++++++--------------------------
 4 files changed, 61 insertions(+), 95 deletions(-)

diff --git a/Project.toml b/Project.toml
index 86259e4..21435ca 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,10 +4,10 @@ uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
 version = "0.12.0"
 
 [compat]
-AMDGPU = "0.3.7"
-CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12"
+AMDGPU = "0.5"
+CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, 4"
 LoopVectorization = "0.12"
-MPI = "0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19"
+MPI = "0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20"
 julia = "1.7"
 
 [deps]
diff --git a/src/select_device.jl b/src/select_device.jl
index 62710f7..3c6a340 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -20,13 +20,13 @@ function select_device()
             nb_devices = length(CUDA.devices())
         elseif amdgpu_enabled()
             @assert AMDGPU.functional()
-            nb_devices = length(AMDGPU.get_agents(:gpu))
+            nb_devices = length(AMDGPU.devices())
         end
         comm_l = MPI.Comm_split_type(comm(), MPI.MPI_COMM_TYPE_SHARED, me())
         if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end
         me_l = MPI.Comm_rank(comm_l)
         if     cuda_enabled()   CUDA.device!(me_l)
-        elseif amdgpu_enabled() AMDGPU.device!(me_l+1)
+        elseif amdgpu_enabled() AMDGPU.device_id!(me_l+1)
         end
         return me_l
     else
diff --git a/src/shared.jl b/src/shared.jl
index 21f40b7..961726c 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -125,5 +125,5 @@ end
 ## AMDGPU functions
 
 function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
-    return unsafe_wrap(ROCArray,pointer(buf),size(buf)), pointer(buf);
+    return unsafe_wrap(ROCArray, pointer(buf), size(buf)), pointer(buf);
 end
diff --git a/src/update_halo.jl b/src/update_halo.jl
index 451bbf2..889e90c 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -124,7 +124,7 @@ let
             for i = 1:length(bufs)
                 for n = 1:length(bufs[i])
                     if is_cuarray(bufs[i][n])  CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
-                    # if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
+                    if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
                 end
             end
         end
@@ -469,38 +469,21 @@ end
 
 # (AMDGPU functions)
 
-function allocate_rocqueues(fields::GGArray...)
-    allocate_rocqueues_iwrite(fields...);
-    allocate_rocqueues_iread(fields...);
+function allocate_rocstreams(fields::GGArray...)
+    allocate_rocstreams_iwrite(fields...);
+    allocate_rocstreams_iread(fields...);
 end
 
 let
-    global iwrite_sendbufs!, allocate_rocqueues_iwrite, wait_iwrite
+    global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite
 
-    rocqueues  = Array{AMDGPU.HSAQueue}(undef, NNEIGHBORS_PER_DIM, 0)
-    rocsignals = Array{Union{AMDGPU.HSASignal,AMDGPU.RuntimeEvent{AMDGPU.HSAStatusSignal},Missing}}(undef, NNEIGHBORS_PER_DIM, 0)
+    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)
 
-    function wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
-        if !ismissing(rocsignals[n,i]) # DEBUG: tmp solution to avoid rocsignals array access filing when accessing an unset signal
-            wait(rocsignals[n,i]);
-            rocsignals[n,i] = missing;
-        end
-    end
+    wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = synchronize(rocstreams[n,i]);
 
-    function allocate_rocqueues_iwrite(fields::GGArray...)
-        if length(fields) > size(rocqueues,2)  # Note: for simplicity, we create a queue for every field even if it is not a ROCArray
-            nqueues = length(fields)-size(rocqueues,2);
-            new_rocqueues  = Array{AMDGPU.HSAQueue}(undef, NNEIGHBORS_PER_DIM, nqueues);
-            new_rocsignals = Array{Union{AMDGPU.HSASignal,AMDGPU.RuntimeEvent{AMDGPU.HSAStatusSignal},Missing}}(missing, NNEIGHBORS_PER_DIM, nqueues); # DEBUG: tmp solution to avoid rocsignals array access filing when accessing an unset signal
-            for i = 1:nqueues
-                for n=1:NNEIGHBORS_PER_DIM
-                    q = AMDGPU.HSAQueue(get_default_agent())
-                    AMDGPU.HSA.amd_queue_set_priority(q.queue, AMDGPU.HSA.AMD_QUEUE_PRIORITY_HIGH)
-                    new_rocqueues[n,i] = q
-                end
-            end
-            rocqueues  = [rocqueues  new_rocqueues]
-            rocsignals = [rocsignals new_rocsignals]
+    function allocate_rocstreams_iwrite(fields::GGArray...)
+        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
+            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
         end
     end
 
@@ -508,46 +491,28 @@ let
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
             if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
-                ranges   = sendranges(n, dim, A);
+                ranges = sendranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
-                halosize = Tuple([r[end] - r[1] + 1 for r in ranges]);
-                rocsignals[n,i] = @roc gridsize=halosize groupsize=nthreads queue=rocqueues[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); # DEBUG: usually @roc is wrapped by wait(), but since we don't want sync one should check what to do.
+                halosize = [r[end] - r[1] + 1 for r in ranges];
+                nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
             else
-                rocsignals[n,i] = HSASignal()
-                write_d2h_async!(sendbuf_flat(n,dim,i,A),A,sendranges(n,dim,A),rocsignals[n,i]);
+                Base.copyto!(sendbuf_flat(n,dim,i,A), 1, A, 1,sendranges(n,dim,A); async=true)
             end
         end
     end
 end
 
 let
-    global iread_recvbufs!, allocate_rocqueues_iread, wait_iread
+    global iread_recvbufs!, allocate_rocstreams_iread, wait_iread
 
-    rocqueues  = Array{AMDGPU.HSAQueue}(undef, NNEIGHBORS_PER_DIM, 0)
-    rocsignals = Array{Union{AMDGPU.HSASignal,AMDGPU.RuntimeEvent{AMDGPU.HSAStatusSignal},Missing}}(undef, NNEIGHBORS_PER_DIM, 0)
+    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)
 
-    function wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
-        if !ismissing(rocsignals[n,i]) # DEBUG: tmp solution to avoid rocsignals array access filing when accessing an unset signal
-            wait(rocsignals[n,i]);
-            rocsignals[n,i] = missing;
-        end
-        return
-    end
-
-    function allocate_rocqueues_iread(fields::GGArray...)
-        if length(fields) > size(rocqueues,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
-            nqueues = length(fields)-size(rocqueues,2);
-            new_rocqueues  = Array{AMDGPU.HSAQueue}(undef, NNEIGHBORS_PER_DIM, nqueues);
-            new_rocsignals = Array{Union{AMDGPU.HSASignal,AMDGPU.RuntimeEvent{AMDGPU.HSAStatusSignal},Missing}}(missing, NNEIGHBORS_PER_DIM, nqueues); # DEBUG: tmp solution to avoid rocsignals array access filing when accessing an unset signal
-            for i = 1:nqueues
-                for n=1:NNEIGHBORS_PER_DIM
-                    q = AMDGPU.HSAQueue(get_default_agent())
-                    AMDGPU.HSA.amd_queue_set_priority(q.queue, AMDGPU.HSA.AMD_QUEUE_PRIORITY_HIGH)
-                    new_rocqueues[n,i] = q
-                end
-            end
-            rocqueues  = [rocqueues  new_rocqueues]
-            rocsignals = [rocsignals new_rocsignals]
+    wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = synchronize(rocstreams[n,i]);
+
+    function allocate_rocstreams_iread(fields::GGArray...)
+        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
+            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
         end
     end
 
@@ -555,13 +520,14 @@ let
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
             if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
-                ranges   = recvranges(n, dim, A);
+                ranges = recvranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
-                halosize = Tuple([r[end] - r[1] + 1 for r in ranges]);
-                rocsignals[n,i] = @roc gridsize=halosize groupsize=nthreads queue=rocqueues[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
+                halosize = [r[end] - r[1] + 1 for r in ranges];
+                nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
             else
-                rocsignals[n,i] = HSASignal()
-                read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocsignals[n,i]);
+                # read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocsignals[n,i]);
+                Base.copyto!(recvbuf_flat(n,dim,i,A), 1, A, 1,recvranges(n,dim,A))
             end
         end
     end
@@ -708,33 +674,33 @@ function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrang
     return nothing
 end
 
-# Write to the send buffer on the host from the array on the device (d2h).
-function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
-    locked_ptr = convert(Ptr{T}, AMDGPU.Mem.lock(pointer(sendbuf),sizeof(sendbuf),get_default_agent()))
-    AMDGPU.Mem.unsafe_copy3d!(
-        locked_ptr, pointer(A),
-        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
-        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
-        srcPitch=sizeof(T)*size(A,1), srcSlice=sizeof(T)*size(A,1)*size(A,2),
-        dstPitch=sizeof(T)*length(sendranges[1]), dstSlice=sizeof(T)*length(sendranges[1])*length(sendranges[2]),
-        async=true, signal=signal
-    )
-    return nothing
-end
-
-# Read from the receive buffer on the host and store on the array on the device (h2d).
-function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
-    locked_ptr = convert(Ptr{T}, AMDGPU.Mem.lock(pointer(recvbuf),sizeof(recvbuf),get_default_agent()))
-    AMDGPU.Mem.unsafe_copy3d!(
-        pointer(A), locked_ptr,
-        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
-        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
-        srcPitch=sizeof(T)*length(recvranges[1]), srcSlice=sizeof(T)*length(recvranges[1])*length(recvranges[2]),
-        dstPitch=sizeof(T)*size(A,1), dstSlice=sizeof(T)*size(A,1)size(A,2),
-        async=true, signal=signal
-    )
-    return nothing
-end
+# # Write to the send buffer on the host from the array on the device (d2h).
+# function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
+#     locked_ptr = convert(Ptr{T}, AMDGPU.Mem.lock(pointer(sendbuf),sizeof(sendbuf),get_default_agent()))
+#     AMDGPU.Mem.unsafe_copy3d!(
+#         locked_ptr, pointer(A),
+#         length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
+#         srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
+#         srcPitch=sizeof(T)*size(A,1), srcSlice=sizeof(T)*size(A,1)*size(A,2),
+#         dstPitch=sizeof(T)*length(sendranges[1]), dstSlice=sizeof(T)*length(sendranges[1])*length(sendranges[2]),
+#         async=true, signal=signal
+#     )
+#     return nothing
+# end
+
+# # Read from the receive buffer on the host and store on the array on the device (h2d).
+# function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
+#     locked_ptr = convert(Ptr{T}, AMDGPU.Mem.lock(pointer(recvbuf),sizeof(recvbuf),get_default_agent()))
+#     AMDGPU.Mem.unsafe_copy3d!(
+#         pointer(A), locked_ptr,
+#         length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
+#         dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
+#         srcPitch=sizeof(T)*length(recvranges[1]), srcSlice=sizeof(T)*length(recvranges[1])*length(recvranges[2]),
+#         dstPitch=sizeof(T)*size(A,1), dstSlice=sizeof(T)*size(A,1)size(A,2),
+#         async=true, signal=signal
+#     )
+#     return nothing
+# end
 
 
 ##------------------------------

From d61d39152c07c724b8dbb6e4ef10d0e04f702d7d Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Thu, 29 Jun 2023 22:54:07 +0300
Subject: [PATCH 02/21] More fixes for MPI 0.20 constants and AMDGPU HIP streams

---
 Project.toml                  |  6 ++---
 src/init_global_grid.jl       |  2 +-
 src/select_device.jl          |  2 +-
 src/shared.jl                 |  2 +-
 src/update_halo.jl            | 22 +++++++++++-----
 test/test_init_global_grid.jl |  2 +-
 test/test_select_device.jl    |  2 +-
 test/test_update_halo.jl      | 48 +++++++++++++++++------------------
 8 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/Project.toml b/Project.toml
index 21435ca..20c6a7c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,13 +1,13 @@
-authors = ["Samuel Omlin", "Ludovic Räss", "Ivan Utkin"]
+authors = ["Samuel Omlin", "Ludovic Raess", "Ivan Utkin"]
 name = "ImplicitGlobalGrid"
 uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
 version = "0.12.0"
 
 [compat]
 AMDGPU = "0.5"
-CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, 4"
+CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4"
 LoopVectorization = "0.12"
-MPI = "0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20"
+MPI = "0.20"
 julia = "1.7"
 
 [deps]
diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index da53f6c..cc77591 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -86,7 +86,7 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
     comm_cart = MPI.Cart_create(comm, dims, periods, reorder);
     me        = MPI.Comm_rank(comm_cart);
     coords    = MPI.Cart_coords(comm_cart);
-    neighbors = fill(MPI.MPI_PROC_NULL, NNEIGHBORS_PER_DIM, NDIMS_MPI);
+    neighbors = fill(MPI.PROC_NULL, NNEIGHBORS_PER_DIM, NDIMS_MPI);
     for i = 1:NDIMS_MPI
         neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
     end
diff --git a/src/select_device.jl b/src/select_device.jl
index 3c6a340..a54ef4e 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -22,7 +22,7 @@ function select_device()
             @assert AMDGPU.functional()
             nb_devices = length(AMDGPU.devices())
         end
-        comm_l = MPI.Comm_split_type(comm(), MPI.MPI_COMM_TYPE_SHARED, me())
+        comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me())
         if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end
         me_l = MPI.Comm_rank(comm_l)
         if     cuda_enabled()   CUDA.device!(me_l)
diff --git a/src/shared.jl b/src/shared.jl
index 961726c..9c9700b 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -102,7 +102,7 @@ amdgpuaware_MPI()                      = global_grid().amdgpuaware_MPI
 amdgpuaware_MPI(dim::Integer)          = global_grid().amdgpuaware_MPI[dim]
 loopvectorization()                    = global_grid().loopvectorization
 loopvectorization(dim::Integer)        = global_grid().loopvectorization[dim]
-has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.MPI_PROC_NULL
+has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL
 any_array(fields::GGArray...)          = any([is_array(A) for A in fields])
 any_cuarray(fields::GGArray...)        = any([is_cuarray(A) for A in fields])
 any_rocarray(fields::GGArray...)       = any([is_rocarray(A) for A in fields])
diff --git a/src/update_halo.jl b/src/update_halo.jl
index 889e90c..eaeb214 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -17,7 +17,7 @@ Update the halo of the given GPU/CPU-array(s).
 function update_halo!(A::GGArray...)
     check_initialized();
     check_fields(A...);
-    _update_halo!(A...);  # Asignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user).
+    _update_halo!(A...);  # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user).
     return nothing
 end
 
@@ -482,7 +482,7 @@ let
     wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = synchronize(rocstreams[n,i]);
 
     function allocate_rocstreams_iwrite(fields::GGArray...)
-        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
+        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a ROCArray
             rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
         end
     end
@@ -497,7 +497,7 @@ let
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
             else
-                Base.copyto!(sendbuf_flat(n,dim,i,A), 1, A, 1,sendranges(n,dim,A); async=true)
+                write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
             end
         end
     end
@@ -511,7 +511,7 @@ let
     wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = synchronize(rocstreams[n,i]);
 
     function allocate_rocstreams_iread(fields::GGArray...)
-        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
+        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a ROCArray
             rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
         end
     end
@@ -526,8 +526,7 @@ let
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
             else
-                # read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocsignals[n,i]);
-                Base.copyto!(recvbuf_flat(n,dim,i,A), 1, A, 1,recvranges(n,dim,A))
+                read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
             end
         end
     end
@@ -687,6 +686,11 @@ end
 #     )
 #     return nothing
 # end
+function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
+    AMDGPU.stream!(rocstream)
+    AMDGPU.Base.copyto!(sendbuf, 1, A, 1, sendranges; async=true)
+    return nothing
+end
 
 # # Read from the receive buffer on the host and store on the array on the device (h2d).
 # function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
@@ -701,7 +705,11 @@ end
 #     )
 #     return nothing
 # end
-
+function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
+    AMDGPU.stream!(rocstream)
+    AMDGPU.Base.copyto!(recvbuf, 1, A, 1, recvranges)
+    return nothing
+end
 
 ##------------------------------
 ## FUNCTIONS TO SEND/RECV FIELDS
diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl
index 228e3ad..f24343e 100644
--- a/test/test_init_global_grid.jl
+++ b/test/test_init_global_grid.jl
@@ -6,7 +6,7 @@ import ImplicitGlobalGrid: @require
 
 
 ## Test setup (NOTE: Testset "2. initialization including MPI" completes the test setup as it initializes MPI and must therefore mandatorily be at the 2nd position). NOTE: these tests require nprocs == 1.
-p0 = MPI.MPI_PROC_NULL
+p0 = MPI.PROC_NULL
 nx = 4;
 ny = 4;
 nz = 1;
diff --git a/test/test_select_device.jl b/test/test_select_device.jl
index bd3fba1..5f80c63 100644
--- a/test/test_select_device.jl
+++ b/test/test_select_device.jl
@@ -25,7 +25,7 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); # NOTE: these tests can run with any num
         @static if test_amdgpu
             me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU");
             gpu_id = select_device();
-            @test gpu_id < length(AMDGPU.device())
+            @test gpu_id < length(AMDGPU.devices())
             finalize_global_grid(finalize_MPI=false);
         end
         @static if !(test_cuda || test_amdgpu)
diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index 33ae863..66b2bac 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -348,60 +348,60 @@ dz = 1.0
                         buf_d, buf_h = GG.register(ROCArray,buf);
                         ranges = [2:2, 1:size(P,2), 1:size(P,3)];
                         nthreads = (1, 1, 1);
-                        halosize = Tuple([r[end] - r[1] + 1 for r in ranges]);
-                        wait( @roc gridsize=halosize groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim) );
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        wait( @roc gridsize=halosize groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim) );
+                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         buf .= 0.0;
                         P2  .= 0.0;
-                        rocsignal = HSASignal()
-                        GG.write_d2h_async!(buf, P, ranges, rocsignal); wait(rocsignal);
+                        rocstream = AMDGPU.HIPStream();
+                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        rocsignal = HSASignal()
-                        GG.read_h2d_async!(buf, P2, ranges, rocsignal); wait(rocsignal);
+                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         AMDGPU.Mem.unlock(buf_h);
                         # (dim=2)
                         dim = 2;
                         P2  = gpuzeros(eltype(P),size(P));
                         buf = zeros(size(P,1), size(P,3));
-                        buf_d, buf_h = GG.register(ROCArray,buf);
+                        buf_d, buf_h = GG.register(CuArray,buf);
                         ranges = [1:size(P,1), 3:3, 1:size(P,3)];
                         nthreads = (1, 1, 1);
-                        halosize = Tuple([r[end] - r[1] + 1 for r in ranges]);
-                        wait( @roc gridsize=halosize groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim) );
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        wait( @roc gridsize=halosize groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim) );
+                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         buf .= 0.0;
                         P2  .= 0.0;
-                        rocsignal = HSASignal()
-                        GG.write_d2h_async!(buf, P, ranges, rocsignal); wait(rocsignal);
+                        rocstream = AMDGPU.HIPStream();
+                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        rocsignal = HSASignal()
-                        GG.read_h2d_async!(buf, P2, ranges, rocsignal); wait(rocsignal);
+                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         AMDGPU.Mem.unlock(buf_h);
                         # (dim=3)
                         dim = 3
                         P2  = gpuzeros(eltype(P),size(P));
                         buf = zeros(size(P,1), size(P,2));
-                        buf_d, buf_h = GG.register(ROCArray,buf);
+                        buf_d, buf_h = GG.register(CuArray,buf);
                         ranges = [1:size(P,1), 1:size(P,2), 4:4];
                         nthreads = (1, 1, 1);
-                        halosize = Tuple([r[end] - r[1] + 1 for r in ranges]);
-                        wait( @roc gridsize=halosize groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim) );
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        wait( @roc gridsize=halosize groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim) );
+                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         buf .= 0.0;
                         P2  .= 0.0;
-                        rocsignal = HSASignal()
-                        GG.write_d2h_async!(buf, P, ranges, rocsignal); wait(rocsignal);
+                        rocstream = AMDGPU.HIPStream();
+                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        rocsignal = HSASignal()
-                        GG.read_h2d_async!(buf, P2, ranges, rocsignal); wait(rocsignal);
+                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         AMDGPU.Mem.unlock(buf_h);
                     end
@@ -1050,4 +1050,4 @@ dz = 1.0
 end;
 
 ## Test tear down
-MPI.Finalize()
+MPI.Finalize()
\ No newline at end of file

From 80a78759b8ea3793f5196fe5b8e7edc13f4978d0 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Fri, 30 Jun 2023 16:05:36 +0300
Subject: [PATCH 03/21] Bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 20c6a7c..532ba78 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 authors = ["Samuel Omlin", "Ludovic Raess", "Ivan Utkin"]
 name = "ImplicitGlobalGrid"
 uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
-version = "0.12.0"
+version = "0.13.0"
 
 [compat]
 AMDGPU = "0.5"

From b4695b981e98a22e040d02af60945a4baf254c70 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 15 Jul 2023 00:30:23 +0300
Subject: [PATCH 04/21] Add AMDGPU support - WIP

---
 src/shared.jl            |    5 +-
 src/update_halo.jl       |   42 +-
 test/runtests.jl         |    4 +-
 test/test_update_halo.jl | 1687 +++++++++++++++++++-------------------
 4 files changed, 872 insertions(+), 866 deletions(-)

diff --git a/src/shared.jl b/src/shared.jl
index 9c9700b..8770782 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -125,5 +125,8 @@ end
 ## AMDGPU functions
 
 function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
-    return unsafe_wrap(ROCArray, pointer(buf), size(buf)), pointer(buf);
+    # dbuf = AMDGPU.unsafe_wrap(ROCArray, pointer(buf), size(buf))
+    # rbuf = dbuf.buf
+    # return dbuf, dbuf.buf
+    return unsafe_wrap(ROCArray, pointer(buf), size(buf))
 end
diff --git a/src/update_halo.jl b/src/update_halo.jl
index eaeb214..0e5dca4 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -91,18 +91,18 @@ let
     curecvbufs_raw_h = nothing
     rocsendbufs_raw = nothing
     rocrecvbufs_raw = nothing
-    rocsendbufs_raw_h = nothing
-    rocrecvbufs_raw_h = nothing
+    # rocsendbufs_raw_h = nothing
+    # rocrecvbufs_raw_h = nothing
 
     function free_update_halo_buffers()
         if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(cusendbufs_raw) end
         if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(curecvbufs_raw) end
         if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(cusendbufs_raw_h) end
         if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(curecvbufs_raw_h) end
-        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
-        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
-        if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocsendbufs_raw_h) end
-        if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocrecvbufs_raw_h) end
+        # if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
+        # if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
+        # if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocsendbufs_raw_h) end
+        # if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocrecvbufs_raw_h) end
         sendbufs_raw = nothing
         recvbufs_raw = nothing
         cusendbufs_raw = nothing
@@ -111,8 +111,8 @@ let
         curecvbufs_raw_h = nothing
         rocsendbufs_raw = nothing
         rocrecvbufs_raw = nothing
-        rocsendbufs_raw_h = nothing
-        rocrecvbufs_raw_h = nothing
+        # rocsendbufs_raw_h = nothing
+        # rocrecvbufs_raw_h = nothing
         GC.gc()
     end
 
@@ -124,7 +124,7 @@ let
             for i = 1:length(bufs)
                 for n = 1:length(bufs[i])
                     if is_cuarray(bufs[i][n])  CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
-                    if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
+                    # if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
                 end
             end
         end
@@ -135,7 +135,7 @@ let
             for i = 1:length(bufs)
                 for n = 1:length(bufs[i])
                     if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
-                    if (isa(bufs[i][n],AMDGPU.Mem.Buffer))   AMDGPU.Mem.unlock(bufs[i][n]); bufs[i][n] = []; end
+                    # if (isa(bufs[i][n],AMDGPU.Mem.HostBuffer)) AMDGPU.HIP.hipHostUnregister(bufs[i][n]); bufs[i][n] = []; end
                 end
             end
         end
@@ -170,12 +170,12 @@ let
             end
             if (!isnothing(cusendbufs_raw) && length(cusendbufs_raw[i][1]) < max_halo_elems)
                 for n = 1:NNEIGHBORS_PER_DIM
-                    if (is_cuarray(A) &&  any(cudaaware_MPI())) reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately.
+                    if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately.
                 end
             end
             if (!isnothing(rocsendbufs_raw) && length(rocsendbufs_raw[i][1]) < max_halo_elems)
                 for n = 1:NNEIGHBORS_PER_DIM
-                    if (is_rocarray(A) &&  any(amdgpuaware_MPI())) reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately.
+                    if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately.
                 end
             end
         end
@@ -244,15 +244,15 @@ let
     function init_rocbufs_arrays()
         rocsendbufs_raw = Array{Array{Any,1},1}();
         rocrecvbufs_raw = Array{Array{Any,1},1}();
-        rocsendbufs_raw_h = Array{Array{Any,1},1}();
-        rocrecvbufs_raw_h = Array{Array{Any,1},1}();
+        # rocsendbufs_raw_h = Array{Array{Any,1},1}();
+        # rocrecvbufs_raw_h = Array{Array{Any,1},1}();
     end
 
     function init_rocbufs(T::DataType, fields::GGArray...)
         while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
         while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
-        while (length(rocsendbufs_raw_h) < length(fields)) push!(rocsendbufs_raw_h, [[], []]); end
-        while (length(rocrecvbufs_raw_h) < length(fields)) push!(rocrecvbufs_raw_h, [[], []]); end
+        # while (length(rocsendbufs_raw_h) < length(fields)) push!(rocsendbufs_raw_h, [[], []]); end
+        # while (length(rocrecvbufs_raw_h) < length(fields)) push!(rocrecvbufs_raw_h, [[], []]); end
     end
 
     function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer)
@@ -266,10 +266,12 @@ let
     end
 
     function reregister_rocbufs(T::DataType, i::Integer, n::Integer)
-        if (isa(rocsendbufs_raw_h[i][n],AMDGPU.Mem.Buffer)) AMDGPU.Mem.unlock(rocsendbufs_raw_h[i][n]); rocsendbufs_raw_h[i][n] = []; end
-        if (isa(rocrecvbufs_raw_h[i][n],AMDGPU.Mem.Buffer)) AMDGPU.Mem.unlock(rocrecvbufs_raw_h[i][n]); rocrecvbufs_raw_h[i][n] = []; end
-        rocsendbufs_raw[i][n], rocsendbufs_raw_h[i][n] = register(ROCArray,sendbufs_raw[i][n]);
-        rocrecvbufs_raw[i][n], rocrecvbufs_raw_h[i][n] = register(ROCArray,recvbufs_raw[i][n]);
+        # if (isa(rocsendbufs_raw_h[i][n],AMDGPU.Mem.HostBuffer)) AMDGPU.HIP.hipHostUnregister(rocsendbufs_raw_h[i][n]); rocsendbufs_raw_h[i][n] = []; end
+        # if (isa(rocrecvbufs_raw_h[i][n],AMDGPU.Mem.HostBuffer)) AMDGPU.HIP.hipHostUnregister(rocrecvbufs_raw_h[i][n]); rocrecvbufs_raw_h[i][n] = []; end
+        # rocsendbufs_raw[i][n], rocsendbufs_raw_h[i][n] = register(ROCArray,sendbufs_raw[i][n]);
+        # rocrecvbufs_raw[i][n], rocrecvbufs_raw_h[i][n] = register(ROCArray,recvbufs_raw[i][n]);
+        rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]);
+        rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]);
     end
 
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 60976c1..8697640 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,12 +3,12 @@ push!(LOAD_PATH, "../src") # FIXME: to be removed everywhere?
 
 import ImplicitGlobalGrid # Precompile it.
 
-excludedfiles = [ "test_excluded.jl"];
+excludedfiles = ["test_excluded.jl"];
 
 function runtests()
     exename   = joinpath(Sys.BINDIR, Base.julia_exename())
     testdir   = pwd()
-    istest(f) = endswith(f, ".jl") && startswith(f, "test_")
+    istest(f) = endswith(f, ".jl") && startswith(f, "test_up")
     testfiles = sort(filter(istest, readdir(testdir)))
 
     nfail = 0
diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index 66b2bac..e08c873 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -198,855 +198,856 @@ dz = 1.0
             end
             GG.free_update_halo_buffers();
             GG.allocate_bufs(Y, Z);
-            for dim = 1:ndims(Y), n = 1:nneighbors_per_dim
-                @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
-                @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
-            end
-            for dim = 1:ndims(Z), n = 1:nneighbors_per_dim
-                @test all(size(sendbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
-                @test all(size(recvbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
-            end
+            # for dim = 1:ndims(Y), n = 1:nneighbors_per_dim
+            #     @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
+            #     @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
+            # end
+            # for dim = 1:ndims(Z), n = 1:nneighbors_per_dim
+            #     @test all(size(sendbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
+            #     @test all(size(recvbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
+            # end
         end;
         finalize_global_grid(finalize_MPI=false);
     end;
 
-    @testset "3. data transfer components" begin
-        @testset "iwrite_sendbufs! / iread_recvbufs!" begin
-            @testset "sendranges / recvranges ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-                P   = zeros(nx,  ny,  nz  );
-                A   = zeros(nx-1,ny+2,nz+1);
-                @test GG.sendranges(1, 1, P) == [                    2:2,             1:size(P,2),             1:size(P,3)]
-                @test GG.sendranges(2, 1, P) == [size(P,1)-1:size(P,1)-1,             1:size(P,2),             1:size(P,3)]
-                @test GG.sendranges(1, 2, P) == [            1:size(P,1),                     2:2,             1:size(P,3)]
-                @test GG.sendranges(2, 2, P) == [            1:size(P,1), size(P,2)-1:size(P,2)-1,             1:size(P,3)]
-                @test GG.sendranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     3:3]
-                @test GG.sendranges(2, 3, P) == [            1:size(P,1),             1:size(P,2), size(P,3)-2:size(P,3)-2]
-                @test GG.recvranges(1, 1, P) == [                    1:1,             1:size(P,2),             1:size(P,3)]
-                @test GG.recvranges(2, 1, P) == [    size(P,1):size(P,1),             1:size(P,2),             1:size(P,3)]
-                @test GG.recvranges(1, 2, P) == [            1:size(P,1),                     1:1,             1:size(P,3)]
-                @test GG.recvranges(2, 2, P) == [            1:size(P,1),     size(P,2):size(P,2),             1:size(P,3)]
-                @test GG.recvranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     1:1]
-                @test GG.recvranges(2, 3, P) == [            1:size(P,1),             1:size(P,2),     size(P,3):size(P,3)]
-                @test_throws ErrorException  GG.sendranges(1, 1, A)
-                @test_throws ErrorException  GG.sendranges(2, 1, A)
-                @test GG.sendranges(1, 2, A) == [            1:size(A,1),                     4:4,             1:size(A,3)]
-                @test GG.sendranges(2, 2, A) == [            1:size(A,1), size(A,2)-3:size(A,2)-3,             1:size(A,3)]
-                @test GG.sendranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     4:4]
-                @test GG.sendranges(2, 3, A) == [            1:size(A,1),             1:size(A,2), size(A,3)-3:size(A,3)-3]
-                @test_throws ErrorException  GG.recvranges(1, 1, A)
-                @test_throws ErrorException  GG.recvranges(2, 1, A)
-                @test GG.recvranges(1, 2, A) == [            1:size(A,1),                     1:1,             1:size(A,3)]
-                @test GG.recvranges(2, 2, A) == [            1:size(A,1),     size(A,2):size(A,2),             1:size(A,3)]
-                @test GG.recvranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     1:1]
-                @test GG.recvranges(2, 3, A) == [            1:size(A,1),             1:size(A,2),     size(A,3):size(A,3)]
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "write_h2h! / read_h2h!" begin
-                init_global_grid(nx, ny, nz; quiet=true, init_MPI=false);
-                P  = zeros(nx,  ny,  nz  );
-                P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-                P2 = zeros(size(P));
-                # (dim=1)
-                buf = zeros(size(P,2), size(P,3));
-                ranges = [2:2, 1:size(P,2), 1:size(P,3)];
-                GG.write_h2h!(buf, P, ranges, 1);
-                @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
-                GG.read_h2h!(buf, P2, ranges, 1);
-                @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
-                # (dim=2)
-                buf = zeros(size(P,1), size(P,3));
-                ranges = [1:size(P,1), 3:3, 1:size(P,3)];
-                GG.write_h2h!(buf, P, ranges, 2);
-                @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
-                GG.read_h2h!(buf, P2, ranges, 2);
-                @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
-                # (dim=3)
-                buf = zeros(size(P,1), size(P,2));
-                ranges = [1:size(P,1), 1:size(P,2), 4:4];
-                GG.write_h2h!(buf, P, ranges, 3);
-                @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
-                GG.read_h2h!(buf, P2, ranges, 3);
-                @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @static if test_cuda || test_amdgpu
-                @testset "write_d2x! / write_d2h_async! / read_x2d! / read_h2d_async! ($array_type arrays)" for (array_type, device_type, gpuzeros, GPUArray) in zip(gpu_array_types, gpu_device_types, gpu_allocators, GPUArrayConstructors)
-                    init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
-                    P  = zeros(nx,  ny,  nz  );
-                    P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-                    P  = GPUArray(P);
-                    if array_type == "CUDA"
-                        # (dim=1)
-                        dim = 1;
-                        P2  = gpuzeros(eltype(P),size(P));
-                        buf = zeros(size(P,2), size(P,3));
-                        buf_d, buf_h = GG.register(CuArray,buf);
-                        ranges = [2:2, 1:size(P,2), 1:size(P,3)];
-                        nthreads = (1, 1, 1);
-                        halosize = [r[end] - r[1] + 1 for r in ranges];
-                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-                        @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        custream = stream();
-                        GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        CUDA.Mem.unregister(buf_h);
-                        # (dim=2)
-                        dim = 2;
-                        P2  = gpuzeros(eltype(P),size(P));
-                        buf = zeros(size(P,1), size(P,3));
-                        buf_d, buf_h = GG.register(CuArray,buf);
-                        ranges = [1:size(P,1), 3:3, 1:size(P,3)];
-                        nthreads = (1, 1, 1);
-                        halosize = [r[end] - r[1] + 1 for r in ranges];
-                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-                        @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        custream = stream();
-                        GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        CUDA.Mem.unregister(buf_h);
-                        # (dim=3)
-                        dim = 3
-                        P2  = gpuzeros(eltype(P),size(P));
-                        buf = zeros(size(P,1), size(P,2));
-                        buf_d, buf_h = GG.register(CuArray,buf);
-                        ranges = [1:size(P,1), 1:size(P,2), 4:4];
-                        nthreads = (1, 1, 1);
-                        halosize = [r[end] - r[1] + 1 for r in ranges];
-                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-                        @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        custream = stream();
-                        GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        CUDA.Mem.unregister(buf_h);
-                    elseif array_type == "AMDGPU"
-                        # (dim=1)
-                        dim = 1;
-                        P2  = gpuzeros(eltype(P),size(P));
-                        buf = zeros(size(P,2), size(P,3));
-                        buf_d, buf_h = GG.register(ROCArray,buf);
-                        ranges = [2:2, 1:size(P,2), 1:size(P,3)];
-                        nthreads = (1, 1, 1);
-                        halosize = [r[end] - r[1] + 1 for r in ranges];
-                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        rocstream = AMDGPU.HIPStream();
-                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        AMDGPU.Mem.unlock(buf_h);
-                        # (dim=2)
-                        dim = 2;
-                        P2  = gpuzeros(eltype(P),size(P));
-                        buf = zeros(size(P,1), size(P,3));
-                        buf_d, buf_h = GG.register(CuArray,buf);
-                        ranges = [1:size(P,1), 3:3, 1:size(P,3)];
-                        nthreads = (1, 1, 1);
-                        halosize = [r[end] - r[1] + 1 for r in ranges];
-                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        rocstream = AMDGPU.HIPStream();
-                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        AMDGPU.Mem.unlock(buf_h);
-                        # (dim=3)
-                        dim = 3
-                        P2  = gpuzeros(eltype(P),size(P));
-                        buf = zeros(size(P,1), size(P,2));
-                        buf_d, buf_h = GG.register(CuArray,buf);
-                        ranges = [1:size(P,1), 1:size(P,2), 4:4];
-                        nthreads = (1, 1, 1);
-                        halosize = [r[end] - r[1] + 1 for r in ranges];
-                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        rocstream = AMDGPU.HIPStream();
-                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        AMDGPU.Mem.unlock(buf_h);
-                    end
-                    finalize_global_grid(finalize_MPI=false);
-                end;
-            end
-            @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-                P = zeros(nx,  ny,  nz  );
-                A = zeros(nx-1,ny+2,nz+1);
-                P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]);
-                A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]);
-                GG.allocate_bufs(P, A);
-                if     (array_type == "CUDA")   GG.allocate_custreams(P, A);
-                elseif (array_type == "AMDGPU") GG.allocate_rocqueues(P, A);
-                else                            GG.allocate_tasks(P, A);
-                end
-                dim = 1
-                n = 1
-                GG.iwrite_sendbufs!(n, dim, P, 1);
-                GG.iwrite_sendbufs!(n, dim, A, 2);
-                GG.wait_iwrite(n, P, 1);
-                GG.wait_iwrite(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
-                else
-                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:]))
-                    @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
-                end
-                n = 2
-                GG.iwrite_sendbufs!(n, dim, P, 1);
-                GG.iwrite_sendbufs!(n, dim, A, 2);
-                GG.wait_iwrite(n, P, 1);
-                GG.wait_iwrite(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
-                else
-                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:]))
-                    @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
-                end
-                dim = 2
-                n = 1
-                GG.iwrite_sendbufs!(n, dim, P, 1);
-                GG.iwrite_sendbufs!(n, dim, A, 2);
-                GG.wait_iwrite(n, P, 1);
-                GG.wait_iwrite(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))
-                else
-                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:]))
-                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:]))
-                end
-                n = 2
-                GG.iwrite_sendbufs!(n, dim, P, 1);
-                GG.iwrite_sendbufs!(n, dim, A, 2);
-                GG.wait_iwrite(n, P, 1);
-                GG.wait_iwrite(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))
-                else
-                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:]))
-                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:]))
-                end
-                dim = 3
-                n = 1
-                GG.iwrite_sendbufs!(n, dim, P, 1);
-                GG.iwrite_sendbufs!(n, dim, A, 2);
-                GG.wait_iwrite(n, P, 1);
-                GG.wait_iwrite(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))
-                else
-                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:]))
-                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:]))
-                end
-                n = 2
-                GG.iwrite_sendbufs!(n, dim, P, 1);
-                GG.iwrite_sendbufs!(n, dim, A, 2);
-                GG.wait_iwrite(n, P, 1);
-                GG.wait_iwrite(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))
-                else
-                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:]))
-                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:]))
-                end
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "iread_recvbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-                P = zeros(nx,  ny,  nz  );
-                A = zeros(nx-1,ny+2,nz+1);
-                GG.allocate_bufs(P, A);
-                if     (array_type == "CUDA")   GG.allocate_custreams(P, A);
-                elseif (array_type == "AMDGPU") GG.allocate_rocqueues(P, A);
-                else                            GG.allocate_tasks(P, A);
-                end
-                dim = 1
-                for n = 1:nneighbors_per_dim
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                        GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                    else
-                        GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                        GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                    end
-                end
-                n = 1
-                GG.iread_recvbufs!(n, dim, P, 1);
-                GG.iread_recvbufs!(n, dim, A, 2);
-                GG.wait_iread(n, P, 1);
-                GG.wait_iread(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))
-                    @test all(                          0.0 .== Array(A[1,:,:][:]))
-                else
-                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:]))
-                    @test all(                       0.0 .== CPUArray(A[1,:,:][:]))
-                end
-                n = 2
-                GG.iread_recvbufs!(n, dim, P, 1);
-                GG.iread_recvbufs!(n, dim, A, 2);
-                GG.wait_iread(n, P, 1);
-                GG.wait_iread(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))
-                    @test all(                          0.0 .== Array(A[end,:,:][:]))
-                else
-                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:]))
-                    @test all(                       0.0 .== CPUArray(A[end,:,:][:]))
-                end
-                dim = 2
-                for n = 1:nneighbors_per_dim
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                        GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                    else
-                        GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                        GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                    end
-                end
-                n = 1
-                GG.iread_recvbufs!(n, dim, P, 1);
-                GG.iread_recvbufs!(n, dim, A, 2);
-                GG.wait_iread(n, P, 1);
-                GG.wait_iread(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))
-                else
-                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:]))
-                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:]))
-                end
-                n = 2
-                GG.iread_recvbufs!(n, dim, P, 1);
-                GG.iread_recvbufs!(n, dim, A, 2);
-                GG.wait_iread(n, P, 1);
-                GG.wait_iread(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))
-                else
-                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:]))
-                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:]))
-                end
-                dim = 3
-                for n = 1:nneighbors_per_dim
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                        GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                    else
-                        GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                        GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                    end
-                end
-                n = 1
-                GG.iread_recvbufs!(n, dim, P, 1);
-                GG.iread_recvbufs!(n, dim, A, 2);
-                GG.wait_iread(n, P, 1);
-                GG.wait_iread(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))
-                else
-                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:]))
-                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:]))
-                end
-                n = 2
-                GG.iread_recvbufs!(n, dim, P, 1);
-                GG.iread_recvbufs!(n, dim, A, 2);
-                GG.wait_iread(n, P, 1);
-                GG.wait_iread(n, A, 2);
-                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))
-                else
-                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:]))
-                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:]))
-                end
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            if (nprocs==1)
-                @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
-                    init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-                    P = zeros(nx,  ny,  nz  );
-                    A = zeros(nx-1,ny+2,nz+1);
-                    GG.allocate_bufs(P, A);
-                    dim = 1
-                    for n = 1:nneighbors_per_dim
-                        if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                            GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                            GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                        else
-                            GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                            GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                        end
-                    end
-                    for n = 1:nneighbors_per_dim
-                        GG.sendrecv_halo_local(n, dim, P, 1);
-                        GG.sendrecv_halo_local(n, dim, A, 2);
-                    end
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-                    else
-                        @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
-                        @test all(GG.recvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-                        @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
-                        @test all(GG.recvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-                    end
-                    dim = 2
-                    for n = 1:nneighbors_per_dim
-                        if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                            GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                            GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                        else
-                            GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                            GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                        end
-                    end
-                    for n = 1:nneighbors_per_dim
-                        GG.sendrecv_halo_local(n, dim, P, 1);
-                        GG.sendrecv_halo_local(n, dim, A, 2);
-                    end
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
-                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
-                    else
-                        @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
-                        @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
-                        @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
-                        @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
-                    end
-                    dim = 3
-                    for n = 1:nneighbors_per_dim
-                        if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                            GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                            GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                        else
-                            GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-                            GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-                        end
-                    end
-                    for n = 1:nneighbors_per_dim
-                        GG.sendrecv_halo_local(n, dim, P, 1);
-                        GG.sendrecv_halo_local(n, dim, A, 2);
-                    end
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
-                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
-                    else
-                        @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
-                        @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
-                        @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
-                        @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
-                    end
-                    finalize_global_grid(finalize_MPI=false);
-                end
-            end
-        end;
-        if (nprocs>1)
-            @testset "irecv_halo! / isend_halo ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
-                me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type);
-                P   = zeros(nx,ny,nz);
-                A   = zeros(nx,ny,nz);
-                dim = 1;
-                GG.allocate_bufs(P, A);
-                for n = 1:nneighbors_per_dim
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        GG.gpusendbuf(n,dim,1,P) .= 9.0;
-                        GG.gpurecvbuf(n,dim,1,P) .= 0;
-                        GG.gpusendbuf(n,dim,2,A) .= 9.0;
-                        GG.gpurecvbuf(n,dim,2,A) .= 0;
-                    else
-                        GG.sendbuf(n,dim,1,P) .= 9.0;
-                        GG.recvbuf(n,dim,1,P) .= 0;
-                        GG.sendbuf(n,dim,2,A) .= 9.0;
-                        GG.recvbuf(n,dim,2,A) .= 0;
-                    end
-                end
-                reqs  = fill(MPI.REQUEST_NULL, 2, nneighbors_per_dim, 2);
-                for n = 1:nneighbors_per_dim
-                    reqs[1,n,1] = GG.irecv_halo!(n, dim, P, 1);
-                    reqs[2,n,1] = GG.irecv_halo!(n, dim, A, 2);
-                    reqs[1,n,2] = GG.isend_halo(n, dim, P, 1);
-                    reqs[2,n,2] = GG.isend_halo(n, dim, A, 2);
-                end
-                @test all(reqs .!= [MPI.REQUEST_NULL])
-                MPI.Waitall!(reqs[:]);
-                for n = 1:nneighbors_per_dim
-                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf(n,dim,1,P) .== 9.0)
-                        @test all(GG.gpurecvbuf(n,dim,2,A) .== 9.0)
-                    else
-                        @test all(GG.recvbuf(n,dim,1,P) .== 9.0)
-                        @test all(GG.recvbuf(n,dim,2,A) .== 9.0)
-                    end
-                end
-                finalize_global_grid(finalize_MPI=false);
-            end;
-        end
-    end;
+    # @testset "3. data transfer components" begin
+    #     @testset "iwrite_sendbufs! / iread_recvbufs!" begin
+    #         @testset "sendranges / recvranges ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+    #             P   = zeros(nx,  ny,  nz  );
+    #             A   = zeros(nx-1,ny+2,nz+1);
+    #             @test GG.sendranges(1, 1, P) == [                    2:2,             1:size(P,2),             1:size(P,3)]
+    #             @test GG.sendranges(2, 1, P) == [size(P,1)-1:size(P,1)-1,             1:size(P,2),             1:size(P,3)]
+    #             @test GG.sendranges(1, 2, P) == [            1:size(P,1),                     2:2,             1:size(P,3)]
+    #             @test GG.sendranges(2, 2, P) == [            1:size(P,1), size(P,2)-1:size(P,2)-1,             1:size(P,3)]
+    #             @test GG.sendranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     3:3]
+    #             @test GG.sendranges(2, 3, P) == [            1:size(P,1),             1:size(P,2), size(P,3)-2:size(P,3)-2]
+    #             @test GG.recvranges(1, 1, P) == [                    1:1,             1:size(P,2),             1:size(P,3)]
+    #             @test GG.recvranges(2, 1, P) == [    size(P,1):size(P,1),             1:size(P,2),             1:size(P,3)]
+    #             @test GG.recvranges(1, 2, P) == [            1:size(P,1),                     1:1,             1:size(P,3)]
+    #             @test GG.recvranges(2, 2, P) == [            1:size(P,1),     size(P,2):size(P,2),             1:size(P,3)]
+    #             @test GG.recvranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     1:1]
+    #             @test GG.recvranges(2, 3, P) == [            1:size(P,1),             1:size(P,2),     size(P,3):size(P,3)]
+    #             @test_throws ErrorException  GG.sendranges(1, 1, A)
+    #             @test_throws ErrorException  GG.sendranges(2, 1, A)
+    #             @test GG.sendranges(1, 2, A) == [            1:size(A,1),                     4:4,             1:size(A,3)]
+    #             @test GG.sendranges(2, 2, A) == [            1:size(A,1), size(A,2)-3:size(A,2)-3,             1:size(A,3)]
+    #             @test GG.sendranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     4:4]
+    #             @test GG.sendranges(2, 3, A) == [            1:size(A,1),             1:size(A,2), size(A,3)-3:size(A,3)-3]
+    #             @test_throws ErrorException  GG.recvranges(1, 1, A)
+    #             @test_throws ErrorException  GG.recvranges(2, 1, A)
+    #             @test GG.recvranges(1, 2, A) == [            1:size(A,1),                     1:1,             1:size(A,3)]
+    #             @test GG.recvranges(2, 2, A) == [            1:size(A,1),     size(A,2):size(A,2),             1:size(A,3)]
+    #             @test GG.recvranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     1:1]
+    #             @test GG.recvranges(2, 3, A) == [            1:size(A,1),             1:size(A,2),     size(A,3):size(A,3)]
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "write_h2h! / read_h2h!" begin
+    #             init_global_grid(nx, ny, nz; quiet=true, init_MPI=false);
+    #             P  = zeros(nx,  ny,  nz  );
+    #             P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+    #             P2 = zeros(size(P));
+    #             # (dim=1)
+    #             buf = zeros(size(P,2), size(P,3));
+    #             ranges = [2:2, 1:size(P,2), 1:size(P,3)];
+    #             GG.write_h2h!(buf, P, ranges, 1);
+    #             @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
+    #             GG.read_h2h!(buf, P2, ranges, 1);
+    #             @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
+    #             # (dim=2)
+    #             buf = zeros(size(P,1), size(P,3));
+    #             ranges = [1:size(P,1), 3:3, 1:size(P,3)];
+    #             GG.write_h2h!(buf, P, ranges, 2);
+    #             @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
+    #             GG.read_h2h!(buf, P2, ranges, 2);
+    #             @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
+    #             # (dim=3)
+    #             buf = zeros(size(P,1), size(P,2));
+    #             ranges = [1:size(P,1), 1:size(P,2), 4:4];
+    #             GG.write_h2h!(buf, P, ranges, 3);
+    #             @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
+    #             GG.read_h2h!(buf, P2, ranges, 3);
+    #             @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @static if test_cuda || test_amdgpu
+    #             @testset "write_d2x! / write_d2h_async! / read_x2d! / read_h2d_async! ($array_type arrays)" for (array_type, device_type, gpuzeros, GPUArray) in zip(gpu_array_types, gpu_device_types, gpu_allocators, GPUArrayConstructors)
+    #                 init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
+    #                 P  = zeros(nx,  ny,  nz  );
+    #                 P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+    #                 P  = GPUArray(P);
+    #                 if array_type == "CUDA"
+    #                     # (dim=1)
+    #                     dim = 1;
+    #                     P2  = gpuzeros(eltype(P),size(P));
+    #                     buf = zeros(size(P,2), size(P,3));
+    #                     buf_d, buf_h = GG.register(CuArray,buf);
+    #                     ranges = [2:2, 1:size(P,2), 1:size(P,3)];
+    #                     nthreads = (1, 1, 1);
+    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
+    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+    #                     @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     buf .= 0.0;
+    #                     P2  .= 0.0;
+    #                     custream = stream();
+    #                     GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     CUDA.Mem.unregister(buf_h);
+    #                     # (dim=2)
+    #                     dim = 2;
+    #                     P2  = gpuzeros(eltype(P),size(P));
+    #                     buf = zeros(size(P,1), size(P,3));
+    #                     buf_d, buf_h = GG.register(CuArray,buf);
+    #                     ranges = [1:size(P,1), 3:3, 1:size(P,3)];
+    #                     nthreads = (1, 1, 1);
+    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
+    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+    #                     @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     buf .= 0.0;
+    #                     P2  .= 0.0;
+    #                     custream = stream();
+    #                     GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     CUDA.Mem.unregister(buf_h);
+    #                     # (dim=3)
+    #                     dim = 3
+    #                     P2  = gpuzeros(eltype(P),size(P));
+    #                     buf = zeros(size(P,1), size(P,2));
+    #                     buf_d, buf_h = GG.register(CuArray,buf);
+    #                     ranges = [1:size(P,1), 1:size(P,2), 4:4];
+    #                     nthreads = (1, 1, 1);
+    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
+    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+    #                     @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     buf .= 0.0;
+    #                     P2  .= 0.0;
+    #                     custream = stream();
+    #                     GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     CUDA.Mem.unregister(buf_h);
+    #                 elseif array_type == "AMDGPU"
+    #                     @info "hi"
+    #                     # (dim=1)
+    #                     dim = 1;
+    #                     P2  = gpuzeros(eltype(P),size(P));
+    #                     buf = zeros(size(P,2), size(P,3));
+    #                     buf_d, buf_h = GG.register(ROCArray,buf);
+    #                     ranges = [2:2, 1:size(P,2), 1:size(P,3)];
+    #                     nthreads = (1, 1, 1);
+    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
+    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+    #                     @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # buf .= 0.0;
+    #                     # P2  .= 0.0;
+    #                     # rocstream = AMDGPU.HIPStream();
+    #                     # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+    #                     # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+    #                     # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # AMDGPU.Mem.unlock(buf_h);
+    #                     # (dim=2)
+    #                     dim = 2;
+    #                     P2  = gpuzeros(eltype(P),size(P));
+    #                     buf = zeros(size(P,1), size(P,3));
+    #                     buf_d, buf_h = GG.register(ROCArray,buf);
+    #                     ranges = [1:size(P,1), 3:3, 1:size(P,3)];
+    #                     nthreads = (1, 1, 1);
+    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
+    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+    #                     @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # buf .= 0.0;
+    #                     # P2  .= 0.0;
+    #                     # rocstream = AMDGPU.HIPStream();
+    #                     # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+    #                     # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+    #                     # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # AMDGPU.Mem.unlock(buf_h);
+    #                     # (dim=3)
+    #                     dim = 3
+    #                     P2  = gpuzeros(eltype(P),size(P));
+    #                     buf = zeros(size(P,1), size(P,2));
+    #                     buf_d, buf_h = GG.register(ROCArray,buf);
+    #                     ranges = [1:size(P,1), 1:size(P,2), 4:4];
+    #                     nthreads = (1, 1, 1);
+    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
+    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+    #                     @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # buf .= 0.0;
+    #                     # P2  .= 0.0;
+    #                     # rocstream = AMDGPU.HIPStream();
+    #                     # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+    #                     # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+    #                     # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+    #                     # AMDGPU.Mem.unlock(buf_h);
+    #                 end
+    #                 finalize_global_grid(finalize_MPI=false);
+    #             end;
+    #         end
+    #         @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+    #             P = zeros(nx,  ny,  nz  );
+    #             A = zeros(nx-1,ny+2,nz+1);
+    #             P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]);
+    #             A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]);
+    #             GG.allocate_bufs(P, A);
+    #             if     (array_type == "CUDA")   GG.allocate_custreams(P, A);
+    #             elseif (array_type == "AMDGPU") GG.allocate_rocqueues(P, A);
+    #             else                            GG.allocate_tasks(P, A);
+    #             end
+    #             dim = 1
+    #             n = 1
+    #             GG.iwrite_sendbufs!(n, dim, P, 1);
+    #             GG.iwrite_sendbufs!(n, dim, A, 2);
+    #             GG.wait_iwrite(n, P, 1);
+    #             GG.wait_iwrite(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
+    #             else
+    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:]))
+    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
+    #             end
+    #             n = 2
+    #             GG.iwrite_sendbufs!(n, dim, P, 1);
+    #             GG.iwrite_sendbufs!(n, dim, A, 2);
+    #             GG.wait_iwrite(n, P, 1);
+    #             GG.wait_iwrite(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
+    #             else
+    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:]))
+    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
+    #             end
+    #             dim = 2
+    #             n = 1
+    #             GG.iwrite_sendbufs!(n, dim, P, 1);
+    #             GG.iwrite_sendbufs!(n, dim, A, 2);
+    #             GG.wait_iwrite(n, P, 1);
+    #             GG.wait_iwrite(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))
+    #             else
+    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:]))
+    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:]))
+    #             end
+    #             n = 2
+    #             GG.iwrite_sendbufs!(n, dim, P, 1);
+    #             GG.iwrite_sendbufs!(n, dim, A, 2);
+    #             GG.wait_iwrite(n, P, 1);
+    #             GG.wait_iwrite(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))
+    #             else
+    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:]))
+    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:]))
+    #             end
+    #             dim = 3
+    #             n = 1
+    #             GG.iwrite_sendbufs!(n, dim, P, 1);
+    #             GG.iwrite_sendbufs!(n, dim, A, 2);
+    #             GG.wait_iwrite(n, P, 1);
+    #             GG.wait_iwrite(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))
+    #             else
+    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:]))
+    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:]))
+    #             end
+    #             n = 2
+    #             GG.iwrite_sendbufs!(n, dim, P, 1);
+    #             GG.iwrite_sendbufs!(n, dim, A, 2);
+    #             GG.wait_iwrite(n, P, 1);
+    #             GG.wait_iwrite(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))
+    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))
+    #             else
+    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:]))
+    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:]))
+    #             end
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "iread_recvbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+    #             P = zeros(nx,  ny,  nz  );
+    #             A = zeros(nx-1,ny+2,nz+1);
+    #             GG.allocate_bufs(P, A);
+    #             if     (array_type == "CUDA")   GG.allocate_custreams(P, A);
+    #             elseif (array_type == "AMDGPU") GG.allocate_rocqueues(P, A);
+    #             else                            GG.allocate_tasks(P, A);
+    #             end
+    #             dim = 1
+    #             for n = 1:nneighbors_per_dim
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                     GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                 else
+    #                     GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                     GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                 end
+    #             end
+    #             n = 1
+    #             GG.iread_recvbufs!(n, dim, P, 1);
+    #             GG.iread_recvbufs!(n, dim, A, 2);
+    #             GG.wait_iread(n, P, 1);
+    #             GG.wait_iread(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))
+    #                 @test all(                          0.0 .== Array(A[1,:,:][:]))
+    #             else
+    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:]))
+    #                 @test all(                       0.0 .== CPUArray(A[1,:,:][:]))
+    #             end
+    #             n = 2
+    #             GG.iread_recvbufs!(n, dim, P, 1);
+    #             GG.iread_recvbufs!(n, dim, A, 2);
+    #             GG.wait_iread(n, P, 1);
+    #             GG.wait_iread(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))
+    #                 @test all(                          0.0 .== Array(A[end,:,:][:]))
+    #             else
+    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:]))
+    #                 @test all(                       0.0 .== CPUArray(A[end,:,:][:]))
+    #             end
+    #             dim = 2
+    #             for n = 1:nneighbors_per_dim
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                     GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                 else
+    #                     GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                     GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                 end
+    #             end
+    #             n = 1
+    #             GG.iread_recvbufs!(n, dim, P, 1);
+    #             GG.iread_recvbufs!(n, dim, A, 2);
+    #             GG.wait_iread(n, P, 1);
+    #             GG.wait_iread(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))
+    #             else
+    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:]))
+    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:]))
+    #             end
+    #             n = 2
+    #             GG.iread_recvbufs!(n, dim, P, 1);
+    #             GG.iread_recvbufs!(n, dim, A, 2);
+    #             GG.wait_iread(n, P, 1);
+    #             GG.wait_iread(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))
+    #             else
+    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:]))
+    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:]))
+    #             end
+    #             dim = 3
+    #             for n = 1:nneighbors_per_dim
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                     GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                 else
+    #                     GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                     GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                 end
+    #             end
+    #             n = 1
+    #             GG.iread_recvbufs!(n, dim, P, 1);
+    #             GG.iread_recvbufs!(n, dim, A, 2);
+    #             GG.wait_iread(n, P, 1);
+    #             GG.wait_iread(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))
+    #             else
+    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:]))
+    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:]))
+    #             end
+    #             n = 2
+    #             GG.iread_recvbufs!(n, dim, P, 1);
+    #             GG.iread_recvbufs!(n, dim, A, 2);
+    #             GG.wait_iread(n, P, 1);
+    #             GG.wait_iread(n, A, 2);
+    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))
+    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))
+    #             else
+    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:]))
+    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:]))
+    #             end
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         if (nprocs==1)
+    #             @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
+    #                 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+    #                 P = zeros(nx,  ny,  nz  );
+    #                 A = zeros(nx-1,ny+2,nz+1);
+    #                 GG.allocate_bufs(P, A);
+    #                 dim = 1
+    #                 for n = 1:nneighbors_per_dim
+    #                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                         GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                         GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                     else
+    #                         GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                         GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                     end
+    #                 end
+    #                 for n = 1:nneighbors_per_dim
+    #                     GG.sendrecv_halo_local(n, dim, P, 1);
+    #                     GG.sendrecv_halo_local(n, dim, A, 2);
+    #                 end
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
+    #                     @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+    #                     @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
+    #                     @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+    #                 else
+    #                     @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
+    #                     @test all(GG.recvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+    #                     @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
+    #                     @test all(GG.recvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+    #                 end
+    #                 dim = 2
+    #                 for n = 1:nneighbors_per_dim
+    #                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                         GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                         GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                     else
+    #                         GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                         GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                     end
+    #                 end
+    #                 for n = 1:nneighbors_per_dim
+    #                     GG.sendrecv_halo_local(n, dim, P, 1);
+    #                     GG.sendrecv_halo_local(n, dim, A, 2);
+    #                 end
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
+    #                     @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
+    #                     @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
+    #                     @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
+    #                 else
+    #                     @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
+    #                     @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
+    #                     @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
+    #                     @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
+    #                 end
+    #                 dim = 3
+    #                 for n = 1:nneighbors_per_dim
+    #                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                         GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                         GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                     else
+    #                         GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+    #                         GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+    #                     end
+    #                 end
+    #                 for n = 1:nneighbors_per_dim
+    #                     GG.sendrecv_halo_local(n, dim, P, 1);
+    #                     GG.sendrecv_halo_local(n, dim, A, 2);
+    #                 end
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
+    #                     @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
+    #                     @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
+    #                     @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
+    #                 else
+    #                     @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
+    #                     @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
+    #                     @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
+    #                     @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
+    #                 end
+    #                 finalize_global_grid(finalize_MPI=false);
+    #             end
+    #         end
+    #     end;
+    #     if (nprocs>1)
+    #         @testset "irecv_halo! / isend_halo ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
+    #             me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             P   = zeros(nx,ny,nz);
+    #             A   = zeros(nx,ny,nz);
+    #             dim = 1;
+    #             GG.allocate_bufs(P, A);
+    #             for n = 1:nneighbors_per_dim
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     GG.gpusendbuf(n,dim,1,P) .= 9.0;
+    #                     GG.gpurecvbuf(n,dim,1,P) .= 0;
+    #                     GG.gpusendbuf(n,dim,2,A) .= 9.0;
+    #                     GG.gpurecvbuf(n,dim,2,A) .= 0;
+    #                 else
+    #                     GG.sendbuf(n,dim,1,P) .= 9.0;
+    #                     GG.recvbuf(n,dim,1,P) .= 0;
+    #                     GG.sendbuf(n,dim,2,A) .= 9.0;
+    #                     GG.recvbuf(n,dim,2,A) .= 0;
+    #                 end
+    #             end
+    #             reqs  = fill(MPI.REQUEST_NULL, 2, nneighbors_per_dim, 2);
+    #             for n = 1:nneighbors_per_dim
+    #                 reqs[1,n,1] = GG.irecv_halo!(n, dim, P, 1);
+    #                 reqs[2,n,1] = GG.irecv_halo!(n, dim, A, 2);
+    #                 reqs[1,n,2] = GG.isend_halo(n, dim, P, 1);
+    #                 reqs[2,n,2] = GG.isend_halo(n, dim, A, 2);
+    #             end
+    #             @test all(reqs .!= [MPI.REQUEST_NULL])
+    #             MPI.Waitall!(reqs[:]);
+    #             for n = 1:nneighbors_per_dim
+    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+    #                     @test all(GG.gpurecvbuf(n,dim,1,P) .== 9.0)
+    #                     @test all(GG.gpurecvbuf(n,dim,2,A) .== 9.0)
+    #                 else
+    #                     @test all(GG.recvbuf(n,dim,1,P) .== 9.0)
+    #                     @test all(GG.recvbuf(n,dim,2,A) .== 9.0)
+    #                 end
+    #             end
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #     end
+    # end;
 
     # (Backup field filled with encoded coordinates and set boundary to zeros; then update halo and compare with backuped field; it should be the same again, except for the boundaries that are not halos)
-    @testset "4. halo update ($array_type arrays)" for (array_type, device_type, Array) in zip(array_types, device_types, ArrayConstructors)
-        @testset "basic grid (default: periodic)" begin
-            @testset "1D" begin
-                init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
-                P     = zeros(nx);
-                P    .= [x_g(ix,dx,P) for ix=1:size(P,1)];
-                P_ref = copy(P);
-                P[[1, end]] .= 0.0;
-                P     = Array(P);
-                P_ref = Array(P_ref);
-                @require !all(CPUArray(P .== P_ref)) # DEBUG: CPUArray needed here and onwards as mapreduce! is failing on AMDGPU (see https://github.com/JuliaGPU/AMDGPU.jl/issues/210)
-                update_halo!(P);
-                @test all(CPUArray(P .== P_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "2D" begin
-                init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
-                P     = zeros(nx, ny);
-                P    .= [y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2)];
-                P_ref = copy(P);
-                P[[1, end],       :] .= 0.0;
-                P[       :,[1, end]] .= 0.0;
-                P     = Array(P);
-                P_ref = Array(P_ref);
-                @require !all(CPUArray(P .== P_ref))
-                update_halo!(P);
-                @test all(CPUArray(P .== P_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-                P     = zeros(nx, ny, nz);
-                P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-                P_ref = copy(P);
-                P[[1, end],       :,       :] .= 0.0;
-                P[       :,[1, end],       :] .= 0.0;
-                P[       :,       :,[1, end]] .= 0.0;
-                P     = Array(P);
-                P_ref = Array(P_ref);
-                @require !all(CPUArray(P .== P_ref))
-                update_halo!(P);
-                @test all(CPUArray(P .== P_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D (non-default overlap)" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=4, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-                P     = zeros(nx, ny, nz);
-                P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-                P_ref = copy(P);
-                P[[1, end],       :,       :] .= 0.0;
-                P[       :,[1, end],       :] .= 0.0;
-                P[       :,       :,[1, end]] .= 0.0;
-                P     = Array(P);
-                P_ref = Array(P_ref);
-                @require !all(CPUArray(P .== P_ref))
-                update_halo!(P);
-                @test all(CPUArray(P .== P_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D (not periodic)" begin
-                me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
-                P     = zeros(nx, ny, nz);
-                P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-                P_ref = copy(P);
-                P[[1, end],       :,       :] .= 0.0;
-                P[       :,[1, end],       :] .= 0.0;
-                P[       :,       :,[1, end]] .= 0.0;
-                P     = Array(P);
-                P_ref = Array(P_ref);
-                @require !all(CPUArray(P .== P_ref))
-                update_halo!(P);
-                @test all(CPUArray(P[2:end-1,2:end-1,2:end-1] .== P_ref[2:end-1,2:end-1,2:end-1]))
-                if (coords[1] ==         0) @test all(CPUArray(P[  1,  :,  :] .== 0.0)); else @test all(CPUArray(P[      1,2:end-1,2:end-1] .== P_ref[      1,2:end-1,2:end-1])); end  # Verifcation of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
-                if (coords[1] == dims[1]-1) @test all(CPUArray(P[end,  :,  :] .== 0.0)); else @test all(CPUArray(P[    end,2:end-1,2:end-1] .== P_ref[    end,2:end-1,2:end-1])); end
-                if (coords[2] ==         0) @test all(CPUArray(P[  :,  1,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,      1,2:end-1] .== P_ref[2:end-1,      1,2:end-1])); end
-                if (coords[2] == dims[2]-1) @test all(CPUArray(P[  :,end,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,    end,2:end-1] .== P_ref[2:end-1,    end,2:end-1])); end
-                if (coords[3] ==         0) @test all(CPUArray(P[  :,  :,  1] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,      1] .== P_ref[2:end-1,2:end-1,      1])); end
-                if (coords[3] == dims[3]-1) @test all(CPUArray(P[  :,  :,end] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,    end] .== P_ref[2:end-1,2:end-1,    end])); end
-                finalize_global_grid(finalize_MPI=false);
-            end;
-        end;
-        @testset "staggered grid (default: periodic)" begin
-            @testset "1D" begin
-                init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
-                Vx     = zeros(nx+1);
-                Vx    .= [x_g(ix,dx,Vx) for ix=1:size(Vx,1)];
-                Vx_ref = copy(Vx);
-                Vx[[1, end]] .= 0.0;
-                Vx     = Array(Vx);
-                Vx_ref = Array(Vx_ref);
-                @require !all(CPUArray(Vx .== Vx_ref))
-                update_halo!(Vx);
-                @test all(CPUArray(Vx .== Vx_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "2D" begin
-                init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
-                Vy     = zeros(nx,ny+1);
-                Vy    .= [y_g(iy,dy,Vy)*1e1 + x_g(ix,dx,Vy) for ix=1:size(Vy,1), iy=1:size(Vy,2)];
-                Vy_ref = copy(Vy);
-                Vy[[1, end],       :] .= 0.0;
-                Vy[       :,[1, end]] .= 0.0;
-                Vy     = Array(Vy);
-                Vy_ref = Array(Vy_ref);
-                @require !all(CPUArray(Vy .== Vy_ref))
-                update_halo!(Vy);
-                @test all(CPUArray(Vy .== Vy_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-                Vz     = zeros(nx,ny,nz+1);
-                Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-                Vz_ref = copy(Vz);
-                Vz[[1, end],       :,       :] .= 0.0;
-                Vz[       :,[1, end],       :] .= 0.0;
-                Vz[       :,       :,[1, end]] .= 0.0;
-                Vz     = Array(Vz);
-                Vz_ref = Array(Vz_ref);
-                @require !all(CPUArray(Vz .== Vz_ref))
-                update_halo!(Vz);
-                @test all(CPUArray(Vz .== Vz_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D (non-default overlap)" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-                Vx     = zeros(nx+1,ny,nz);
-                Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-                Vx_ref = copy(Vx);
-                Vx[[1, end],       :,       :] .= 0.0;
-                Vx[       :,[1, end],       :] .= 0.0;
-                Vx[       :,       :,[1, end]] .= 0.0;
-                Vx     = Array(Vx);
-                Vx_ref = Array(Vx_ref);
-                @require !all(CPUArray(Vx .== Vx_ref))
-                update_halo!(Vx);
-                @test all(CPUArray(Vx .== Vx_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D (not periodic)" begin
-                me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
-                Vz     = zeros(nx,ny,nz+1);
-                Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-                Vz_ref = copy(Vz);
-                Vz[[1, end],       :,       :] .= 0.0;
-                Vz[       :,[1, end],       :] .= 0.0;
-                Vz[       :,       :,[1, end]] .= 0.0;
-                Vz     = Array(Vz);
-                Vz_ref = Array(Vz_ref);
-                @require !all(CPUArray(Vz .== Vz_ref))
-                update_halo!(Vz);
-                @test all(CPUArray(Vz[2:end-1,2:end-1,2:end-1] .== Vz_ref[2:end-1,2:end-1,2:end-1]))
-                if (coords[1] ==         0) @test all(CPUArray(Vz[  1,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[      1,2:end-1,2:end-1] .== Vz_ref[      1,2:end-1,2:end-1])); end  # Verifcation of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
-                if (coords[1] == dims[1]-1) @test all(CPUArray(Vz[end,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[    end,2:end-1,2:end-1] .== Vz_ref[    end,2:end-1,2:end-1])); end
-                if (coords[2] ==         0) @test all(CPUArray(Vz[  :,  1,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,      1,2:end-1] .== Vz_ref[2:end-1,      1,2:end-1])); end
-                if (coords[2] == dims[2]-1) @test all(CPUArray(Vz[  :,end,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,    end,2:end-1] .== Vz_ref[2:end-1,    end,2:end-1])); end
-                if (coords[3] ==         0) @test all(CPUArray(Vz[  :,  :,  1] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,      1] .== Vz_ref[2:end-1,2:end-1,      1])); end
-                if (coords[3] == dims[3]-1) @test all(CPUArray(Vz[  :,  :,end] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,    end] .== Vz_ref[2:end-1,2:end-1,    end])); end
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "2D (no halo in one dim)" begin
-                init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
-                A     = zeros(nx-1,ny+2);
-                A    .= [y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2)];
-                A_ref = copy(A);
-                A[[1, end],       :] .= 0.0;
-                A[       :,[1, end]] .= 0.0;
-                A     = Array(A);
-                A_ref = Array(A_ref);
-                @require !all(CPUArray(A .== A_ref))
-                update_halo!(A);
-                @test all(CPUArray(A[2:end-1,:] .== A_ref[2:end-1,:]))
-                @test all(CPUArray(A[[1, end],:] .== 0.0))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D (no halo in one dim)" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-                A     = zeros(nx+2,ny-1,nz+1);
-                A    .= [z_g(iz,dz,A)*1e2 + y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)];
-                A_ref = copy(A);
-                A[[1, end],       :,       :] .= 0.0;
-                A[       :,[1, end],       :] .= 0.0;
-                A[       :,       :,[1, end]] .= 0.0;
-                A     = Array(A);
-                A_ref = Array(A_ref);
-                @require !all(CPUArray(A .== A_ref))
-                update_halo!(A);
-                @test all(CPUArray(A[:,2:end-1,:] .== A_ref[:,2:end-1,:]))
-                @test all(CPUArray(A[:,[1, end],:] .== 0.0))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            @testset "3D (Complex)" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-                Vz     = zeros(ComplexF16,nx,ny,nz+1);
-                Vz    .= [(1+im)*(z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz)) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-                Vz_ref = copy(Vz);
-                Vz[[1, end],       :,       :] .= 0.0;
-                Vz[       :,[1, end],       :] .= 0.0;
-                Vz[       :,       :,[1, end]] .= 0.0;
-                Vz     = Array(Vz);
-                Vz_ref = Array(Vz_ref);
-                @require !all(CPUArray(Vz .== Vz_ref))
-                update_halo!(Vz);
-                @test all(CPUArray(Vz .== Vz_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-            # @testset "3D (changing datatype)" begin
-            #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-            #     Vz     = zeros(nx,ny,nz+1);
-            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-            #     Vz_ref = copy(Vz);
-            #     Vx     = zeros(Float32,nx+1,ny,nz);
-            #     Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-            #     Vx_ref = copy(Vx);
-            #     Vz[[1, end],       :,       :] .= 0.0;
-            #     Vz[       :,[1, end],       :] .= 0.0;
-            #     Vz[       :,       :,[1, end]] .= 0.0;
-            #     Vz     = Array(Vz);
-            #     Vz_ref = Array(Vz_ref);
-            #     @require !all(Vz .== Vz_ref)
-            #     update_halo!(Vz);
-            #     @test all(Vz .== Vz_ref)
-            #     Vx[[1, end],       :,       :] .= 0.0;
-            #     Vx[       :,[1, end],       :] .= 0.0;
-            #     Vx[       :,       :,[1, end]] .= 0.0;
-            #     Vx     = Array(Vx);
-            #     Vx_ref = Array(Vx_ref);
-            #     @require !all(Vx .== Vx_ref)
-            #     update_halo!(Vx);
-            #     @test all(Vx .== Vx_ref)
-            #     #TODO: added for GPU - quick fix:
-            #     Vz     = zeros(nx,ny,nz+1);
-            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-            #     Vz_ref = copy(Vz);
-            #     Vz[[1, end],       :,       :] .= 0.0;
-            #     Vz[       :,[1, end],       :] .= 0.0;
-            #     Vz[       :,       :,[1, end]] .= 0.0;
-            #     Vz     = Array(Vz);
-            #     Vz_ref = Array(Vz_ref);
-            #     @require !all(Vz .== Vz_ref)
-            #     update_halo!(Vz);
-            #     @test all(Vz .== Vz_ref)
-            #     finalize_global_grid(finalize_MPI=false);
-            # end;
-            # @testset "3D (changing datatype) (Complex)" begin
-            #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-            #     Vz     = zeros(nx,ny,nz+1);
-            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-            #     Vz_ref = copy(Vz);
-            #     Vx     = zeros(ComplexF64,nx+1,ny,nz);
-            #     Vx    .= [(1+im)*(z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx)) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-            #     Vx_ref = copy(Vx);
-            #     Vz[[1, end],       :,       :] .= 0.0;
-            #     Vz[       :,[1, end],       :] .= 0.0;
-            #     Vz[       :,       :,[1, end]] .= 0.0;
-            #     Vz     = Array(Vz);
-            #     Vz_ref = Array(Vz_ref);
-            #     @require !all(Vz .== Vz_ref)
-            #     update_halo!(Vz);
-            #     @test all(Vz .== Vz_ref)
-            #     Vx[[1, end],       :,       :] .= 0.0;
-            #     Vx[       :,[1, end],       :] .= 0.0;
-            #     Vx[       :,       :,[1, end]] .= 0.0;
-            #     Vx     = Array(Vx);
-            #     Vx_ref = Array(Vx_ref);
-            #     @require !all(Vx .== Vx_ref)
-            #     update_halo!(Vx);
-            #     @test all(Vx .== Vx_ref)
-            #     #TODO: added for GPU - quick fix:
-            #     Vz     = zeros(nx,ny,nz+1);
-            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-            #     Vz_ref = copy(Vz);
-            #     Vz[[1, end],       :,       :] .= 0.0;
-            #     Vz[       :,[1, end],       :] .= 0.0;
-            #     Vz[       :,       :,[1, end]] .= 0.0;
-            #     Vz     = Array(Vz);
-            #     Vz_ref = Array(Vz_ref);
-            #     @require !all(Vz .== Vz_ref)
-            #     update_halo!(Vz);
-            #     @test all(Vz .== Vz_ref)
-            #     finalize_global_grid(finalize_MPI=false);
-            # end;
-            @testset "3D (two fields simultaneously)" begin
-                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-                Vz     = zeros(nx,ny,nz+1);
-                Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-                Vz_ref = copy(Vz);
-                Vx     = zeros(nx+1,ny,nz);
-                Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-                Vx_ref = copy(Vx);
-                Vz[[1, end],       :,       :] .= 0.0;
-                Vz[       :,[1, end],       :] .= 0.0;
-                Vz[       :,       :,[1, end]] .= 0.0;
-                Vx[[1, end],       :,       :] .= 0.0;
-                Vx[       :,[1, end],       :] .= 0.0;
-                Vx[       :,       :,[1, end]] .= 0.0;
-                Vz     = Array(Vz);
-                Vz_ref = Array(Vz_ref);
-                Vx     = Array(Vx);
-                Vx_ref = Array(Vx_ref);
-                @require !all(CPUArray(Vz .== Vz_ref))
-                @require !all(CPUArray(Vx .== Vx_ref))
-                update_halo!(Vz, Vx);
-                @test all(CPUArray(Vz .== Vz_ref))
-                @test all(CPUArray(Vx .== Vx_ref))
-                finalize_global_grid(finalize_MPI=false);
-            end;
-        end;
-    end;
+    # @testset "4. halo update ($array_type arrays)" for (array_type, device_type, Array) in zip(array_types, device_types, ArrayConstructors)
+    #     @testset "basic grid (default: periodic)" begin
+    #         @testset "1D" begin
+    #             init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             P     = zeros(nx);
+    #             P    .= [x_g(ix,dx,P) for ix=1:size(P,1)];
+    #             P_ref = copy(P);
+    #             P[[1, end]] .= 0.0;
+    #             P     = Array(P);
+    #             P_ref = Array(P_ref);
+    #             @require !all(CPUArray(P .== P_ref)) # DEBUG: CPUArray needed here and onwards as mapreduce! is failing on AMDGPU (see https://github.com/JuliaGPU/AMDGPU.jl/issues/210)
+    #             update_halo!(P);
+    #             @test all(CPUArray(P .== P_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "2D" begin
+    #             init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             P     = zeros(nx, ny);
+    #             P    .= [y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2)];
+    #             P_ref = copy(P);
+    #             P[[1, end],       :] .= 0.0;
+    #             P[       :,[1, end]] .= 0.0;
+    #             P     = Array(P);
+    #             P_ref = Array(P_ref);
+    #             @require !all(CPUArray(P .== P_ref))
+    #             update_halo!(P);
+    #             @test all(CPUArray(P .== P_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             P     = zeros(nx, ny, nz);
+    #             P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+    #             P_ref = copy(P);
+    #             P[[1, end],       :,       :] .= 0.0;
+    #             P[       :,[1, end],       :] .= 0.0;
+    #             P[       :,       :,[1, end]] .= 0.0;
+    #             P     = Array(P);
+    #             P_ref = Array(P_ref);
+    #             @require !all(CPUArray(P .== P_ref))
+    #             update_halo!(P);
+    #             @test all(CPUArray(P .== P_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D (non-default overlap)" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=4, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+    #             P     = zeros(nx, ny, nz);
+    #             P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+    #             P_ref = copy(P);
+    #             P[[1, end],       :,       :] .= 0.0;
+    #             P[       :,[1, end],       :] .= 0.0;
+    #             P[       :,       :,[1, end]] .= 0.0;
+    #             P     = Array(P);
+    #             P_ref = Array(P_ref);
+    #             @require !all(CPUArray(P .== P_ref))
+    #             update_halo!(P);
+    #             @test all(CPUArray(P .== P_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D (not periodic)" begin
+    #             me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
+    #             P     = zeros(nx, ny, nz);
+    #             P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+    #             P_ref = copy(P);
+    #             P[[1, end],       :,       :] .= 0.0;
+    #             P[       :,[1, end],       :] .= 0.0;
+    #             P[       :,       :,[1, end]] .= 0.0;
+    #             P     = Array(P);
+    #             P_ref = Array(P_ref);
+    #             @require !all(CPUArray(P .== P_ref))
+    #             update_halo!(P);
+    #             @test all(CPUArray(P[2:end-1,2:end-1,2:end-1] .== P_ref[2:end-1,2:end-1,2:end-1]))
+    #             if (coords[1] ==         0) @test all(CPUArray(P[  1,  :,  :] .== 0.0)); else @test all(CPUArray(P[      1,2:end-1,2:end-1] .== P_ref[      1,2:end-1,2:end-1])); end  # Verifcation of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
+    #             if (coords[1] == dims[1]-1) @test all(CPUArray(P[end,  :,  :] .== 0.0)); else @test all(CPUArray(P[    end,2:end-1,2:end-1] .== P_ref[    end,2:end-1,2:end-1])); end
+    #             if (coords[2] ==         0) @test all(CPUArray(P[  :,  1,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,      1,2:end-1] .== P_ref[2:end-1,      1,2:end-1])); end
+    #             if (coords[2] == dims[2]-1) @test all(CPUArray(P[  :,end,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,    end,2:end-1] .== P_ref[2:end-1,    end,2:end-1])); end
+    #             if (coords[3] ==         0) @test all(CPUArray(P[  :,  :,  1] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,      1] .== P_ref[2:end-1,2:end-1,      1])); end
+    #             if (coords[3] == dims[3]-1) @test all(CPUArray(P[  :,  :,end] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,    end] .== P_ref[2:end-1,2:end-1,    end])); end
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #     end;
+    #     @testset "staggered grid (default: periodic)" begin
+    #         @testset "1D" begin
+    #             init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             Vx     = zeros(nx+1);
+    #             Vx    .= [x_g(ix,dx,Vx) for ix=1:size(Vx,1)];
+    #             Vx_ref = copy(Vx);
+    #             Vx[[1, end]] .= 0.0;
+    #             Vx     = Array(Vx);
+    #             Vx_ref = Array(Vx_ref);
+    #             @require !all(CPUArray(Vx .== Vx_ref))
+    #             update_halo!(Vx);
+    #             @test all(CPUArray(Vx .== Vx_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "2D" begin
+    #             init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             Vy     = zeros(nx,ny+1);
+    #             Vy    .= [y_g(iy,dy,Vy)*1e1 + x_g(ix,dx,Vy) for ix=1:size(Vy,1), iy=1:size(Vy,2)];
+    #             Vy_ref = copy(Vy);
+    #             Vy[[1, end],       :] .= 0.0;
+    #             Vy[       :,[1, end]] .= 0.0;
+    #             Vy     = Array(Vy);
+    #             Vy_ref = Array(Vy_ref);
+    #             @require !all(CPUArray(Vy .== Vy_ref))
+    #             update_halo!(Vy);
+    #             @test all(CPUArray(Vy .== Vy_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             Vz     = zeros(nx,ny,nz+1);
+    #             Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #             Vz_ref = copy(Vz);
+    #             Vz[[1, end],       :,       :] .= 0.0;
+    #             Vz[       :,[1, end],       :] .= 0.0;
+    #             Vz[       :,       :,[1, end]] .= 0.0;
+    #             Vz     = Array(Vz);
+    #             Vz_ref = Array(Vz_ref);
+    #             @require !all(CPUArray(Vz .== Vz_ref))
+    #             update_halo!(Vz);
+    #             @test all(CPUArray(Vz .== Vz_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D (non-default overlap)" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+    #             Vx     = zeros(nx+1,ny,nz);
+    #             Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+    #             Vx_ref = copy(Vx);
+    #             Vx[[1, end],       :,       :] .= 0.0;
+    #             Vx[       :,[1, end],       :] .= 0.0;
+    #             Vx[       :,       :,[1, end]] .= 0.0;
+    #             Vx     = Array(Vx);
+    #             Vx_ref = Array(Vx_ref);
+    #             @require !all(CPUArray(Vx .== Vx_ref))
+    #             update_halo!(Vx);
+    #             @test all(CPUArray(Vx .== Vx_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D (not periodic)" begin
+    #             me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
+    #             Vz     = zeros(nx,ny,nz+1);
+    #             Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #             Vz_ref = copy(Vz);
+    #             Vz[[1, end],       :,       :] .= 0.0;
+    #             Vz[       :,[1, end],       :] .= 0.0;
+    #             Vz[       :,       :,[1, end]] .= 0.0;
+    #             Vz     = Array(Vz);
+    #             Vz_ref = Array(Vz_ref);
+    #             @require !all(CPUArray(Vz .== Vz_ref))
+    #             update_halo!(Vz);
+    #             @test all(CPUArray(Vz[2:end-1,2:end-1,2:end-1] .== Vz_ref[2:end-1,2:end-1,2:end-1]))
+    #             if (coords[1] ==         0) @test all(CPUArray(Vz[  1,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[      1,2:end-1,2:end-1] .== Vz_ref[      1,2:end-1,2:end-1])); end  # Verifcation of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
+    #             if (coords[1] == dims[1]-1) @test all(CPUArray(Vz[end,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[    end,2:end-1,2:end-1] .== Vz_ref[    end,2:end-1,2:end-1])); end
+    #             if (coords[2] ==         0) @test all(CPUArray(Vz[  :,  1,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,      1,2:end-1] .== Vz_ref[2:end-1,      1,2:end-1])); end
+    #             if (coords[2] == dims[2]-1) @test all(CPUArray(Vz[  :,end,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,    end,2:end-1] .== Vz_ref[2:end-1,    end,2:end-1])); end
+    #             if (coords[3] ==         0) @test all(CPUArray(Vz[  :,  :,  1] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,      1] .== Vz_ref[2:end-1,2:end-1,      1])); end
+    #             if (coords[3] == dims[3]-1) @test all(CPUArray(Vz[  :,  :,end] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,    end] .== Vz_ref[2:end-1,2:end-1,    end])); end
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "2D (no halo in one dim)" begin
+    #             init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             A     = zeros(nx-1,ny+2);
+    #             A    .= [y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2)];
+    #             A_ref = copy(A);
+    #             A[[1, end],       :] .= 0.0;
+    #             A[       :,[1, end]] .= 0.0;
+    #             A     = Array(A);
+    #             A_ref = Array(A_ref);
+    #             @require !all(CPUArray(A .== A_ref))
+    #             update_halo!(A);
+    #             @test all(CPUArray(A[2:end-1,:] .== A_ref[2:end-1,:]))
+    #             @test all(CPUArray(A[[1, end],:] .== 0.0))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D (no halo in one dim)" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             A     = zeros(nx+2,ny-1,nz+1);
+    #             A    .= [z_g(iz,dz,A)*1e2 + y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)];
+    #             A_ref = copy(A);
+    #             A[[1, end],       :,       :] .= 0.0;
+    #             A[       :,[1, end],       :] .= 0.0;
+    #             A[       :,       :,[1, end]] .= 0.0;
+    #             A     = Array(A);
+    #             A_ref = Array(A_ref);
+    #             @require !all(CPUArray(A .== A_ref))
+    #             update_halo!(A);
+    #             @test all(CPUArray(A[:,2:end-1,:] .== A_ref[:,2:end-1,:]))
+    #             @test all(CPUArray(A[:,[1, end],:] .== 0.0))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         @testset "3D (Complex)" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             Vz     = zeros(ComplexF16,nx,ny,nz+1);
+    #             Vz    .= [(1+im)*(z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz)) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #             Vz_ref = copy(Vz);
+    #             Vz[[1, end],       :,       :] .= 0.0;
+    #             Vz[       :,[1, end],       :] .= 0.0;
+    #             Vz[       :,       :,[1, end]] .= 0.0;
+    #             Vz     = Array(Vz);
+    #             Vz_ref = Array(Vz_ref);
+    #             @require !all(CPUArray(Vz .== Vz_ref))
+    #             update_halo!(Vz);
+    #             @test all(CPUArray(Vz .== Vz_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #         # @testset "3D (changing datatype)" begin
+    #         #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #         #     Vz     = zeros(nx,ny,nz+1);
+    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #         #     Vz_ref = copy(Vz);
+    #         #     Vx     = zeros(Float32,nx+1,ny,nz);
+    #         #     Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+    #         #     Vx_ref = copy(Vx);
+    #         #     Vz[[1, end],       :,       :] .= 0.0;
+    #         #     Vz[       :,[1, end],       :] .= 0.0;
+    #         #     Vz[       :,       :,[1, end]] .= 0.0;
+    #         #     Vz     = Array(Vz);
+    #         #     Vz_ref = Array(Vz_ref);
+    #         #     @require !all(Vz .== Vz_ref)
+    #         #     update_halo!(Vz);
+    #         #     @test all(Vz .== Vz_ref)
+    #         #     Vx[[1, end],       :,       :] .= 0.0;
+    #         #     Vx[       :,[1, end],       :] .= 0.0;
+    #         #     Vx[       :,       :,[1, end]] .= 0.0;
+    #         #     Vx     = Array(Vx);
+    #         #     Vx_ref = Array(Vx_ref);
+    #         #     @require !all(Vx .== Vx_ref)
+    #         #     update_halo!(Vx);
+    #         #     @test all(Vx .== Vx_ref)
+    #         #     #TODO: added for GPU - quick fix:
+    #         #     Vz     = zeros(nx,ny,nz+1);
+    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #         #     Vz_ref = copy(Vz);
+    #         #     Vz[[1, end],       :,       :] .= 0.0;
+    #         #     Vz[       :,[1, end],       :] .= 0.0;
+    #         #     Vz[       :,       :,[1, end]] .= 0.0;
+    #         #     Vz     = Array(Vz);
+    #         #     Vz_ref = Array(Vz_ref);
+    #         #     @require !all(Vz .== Vz_ref)
+    #         #     update_halo!(Vz);
+    #         #     @test all(Vz .== Vz_ref)
+    #         #     finalize_global_grid(finalize_MPI=false);
+    #         # end;
+    #         # @testset "3D (changing datatype) (Complex)" begin
+    #         #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #         #     Vz     = zeros(nx,ny,nz+1);
+    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #         #     Vz_ref = copy(Vz);
+    #         #     Vx     = zeros(ComplexF64,nx+1,ny,nz);
+    #         #     Vx    .= [(1+im)*(z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx)) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+    #         #     Vx_ref = copy(Vx);
+    #         #     Vz[[1, end],       :,       :] .= 0.0;
+    #         #     Vz[       :,[1, end],       :] .= 0.0;
+    #         #     Vz[       :,       :,[1, end]] .= 0.0;
+    #         #     Vz     = Array(Vz);
+    #         #     Vz_ref = Array(Vz_ref);
+    #         #     @require !all(Vz .== Vz_ref)
+    #         #     update_halo!(Vz);
+    #         #     @test all(Vz .== Vz_ref)
+    #         #     Vx[[1, end],       :,       :] .= 0.0;
+    #         #     Vx[       :,[1, end],       :] .= 0.0;
+    #         #     Vx[       :,       :,[1, end]] .= 0.0;
+    #         #     Vx     = Array(Vx);
+    #         #     Vx_ref = Array(Vx_ref);
+    #         #     @require !all(Vx .== Vx_ref)
+    #         #     update_halo!(Vx);
+    #         #     @test all(Vx .== Vx_ref)
+    #         #     #TODO: added for GPU - quick fix:
+    #         #     Vz     = zeros(nx,ny,nz+1);
+    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #         #     Vz_ref = copy(Vz);
+    #         #     Vz[[1, end],       :,       :] .= 0.0;
+    #         #     Vz[       :,[1, end],       :] .= 0.0;
+    #         #     Vz[       :,       :,[1, end]] .= 0.0;
+    #         #     Vz     = Array(Vz);
+    #         #     Vz_ref = Array(Vz_ref);
+    #         #     @require !all(Vz .== Vz_ref)
+    #         #     update_halo!(Vz);
+    #         #     @test all(Vz .== Vz_ref)
+    #         #     finalize_global_grid(finalize_MPI=false);
+    #         # end;
+    #         @testset "3D (two fields simultaneously)" begin
+    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+    #             Vz     = zeros(nx,ny,nz+1);
+    #             Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+    #             Vz_ref = copy(Vz);
+    #             Vx     = zeros(nx+1,ny,nz);
+    #             Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+    #             Vx_ref = copy(Vx);
+    #             Vz[[1, end],       :,       :] .= 0.0;
+    #             Vz[       :,[1, end],       :] .= 0.0;
+    #             Vz[       :,       :,[1, end]] .= 0.0;
+    #             Vx[[1, end],       :,       :] .= 0.0;
+    #             Vx[       :,[1, end],       :] .= 0.0;
+    #             Vx[       :,       :,[1, end]] .= 0.0;
+    #             Vz     = Array(Vz);
+    #             Vz_ref = Array(Vz_ref);
+    #             Vx     = Array(Vx);
+    #             Vx_ref = Array(Vx_ref);
+    #             @require !all(CPUArray(Vz .== Vz_ref))
+    #             @require !all(CPUArray(Vx .== Vx_ref))
+    #             update_halo!(Vz, Vx);
+    #             @test all(CPUArray(Vz .== Vz_ref))
+    #             @test all(CPUArray(Vx .== Vx_ref))
+    #             finalize_global_grid(finalize_MPI=false);
+    #         end;
+    #     end;
+    # end;
 end;
 
 ## Test tear down

From 39cd85812f1b7b691fc0247eee13bb5415593dc3 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Tue, 18 Jul 2023 18:40:11 +0300
Subject: [PATCH 05/21] Fix register function

---
 src/shared.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/shared.jl b/src/shared.jl
index 8770782..8455714 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -125,8 +125,5 @@ end
 ## AMDGPU functions
 
 function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
-    # dbuf = AMDGPU.unsafe_wrap(ROCArray, pointer(buf), size(buf))
-    # rbuf = dbuf.buf
-    # return dbuf, dbuf.buf
     return unsafe_wrap(ROCArray, pointer(buf), size(buf))
 end

From 5857e1f1a2f54a85683934e43f1734e59faac2e7 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Tue, 18 Jul 2023 18:41:17 +0300
Subject: [PATCH 06/21] Fix halo update functions - WIP needs 3d async memcpy

---
 src/update_halo.jl | 42 ++++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/src/update_halo.jl b/src/update_halo.jl
index 0e5dca4..baaaf64 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -27,7 +27,7 @@ function _update_halo!(fields::GGArray...)
     allocate_bufs(fields...);
     if any_array(fields...) allocate_tasks(fields...); end
     if any_cuarray(fields...) allocate_custreams(fields...); end
-    if any_rocarray(fields...) allocate_rocqueues(fields...); end
+    if any_rocarray(fields...) allocate_rocstreams(fields...); end
 
     for dim = 1:NDIMS_MPI  # NOTE: this works for 1D-3D (e.g. if nx>1, ny>1 and nz=1, then for d=3, there will be no neighbors, i.e. nothing will be done as desired...).
         for ns = 1:NNEIGHBORS_PER_DIM,  i = 1:length(fields)
@@ -99,8 +99,8 @@ let
         if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(curecvbufs_raw) end
         if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(cusendbufs_raw_h) end
         if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(curecvbufs_raw_h) end
-        # if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
-        # if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
+        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
+        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
         # if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocsendbufs_raw_h) end
         # if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocrecvbufs_raw_h) end
         sendbufs_raw = nothing
@@ -124,7 +124,7 @@ let
             for i = 1:length(bufs)
                 for n = 1:length(bufs[i])
                     if is_cuarray(bufs[i][n])  CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
-                    # if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
+                    if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
                 end
             end
         end
@@ -417,7 +417,7 @@ let
 
     custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0)
 
-    wait_iwrite(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = synchronize(custreams[n,i]);
+    wait_iwrite(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]);
 
     function allocate_custreams_iwrite(fields::GGArray...)
         if length(fields) > size(custreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
@@ -445,7 +445,7 @@ let
 
     custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0)
 
-    wait_iread(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = synchronize(custreams[n,i]);
+    wait_iread(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]);
 
     function allocate_custreams_iread(fields::GGArray...)
         if length(fields) > size(custreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a CuArray
@@ -481,7 +481,7 @@ let
 
     rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)
 
-    wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = synchronize(rocstreams[n,i]);
+    wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);
 
     function allocate_rocstreams_iwrite(fields::GGArray...)
         if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a ROCArray
@@ -492,15 +492,15 @@ let
     function iwrite_sendbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
-            if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                 ranges = sendranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                 halosize = [r[end] - r[1] + 1 for r in ranges];
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
-            else
-                write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
-            end
+            # else
+            #     write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
+            # end
         end
     end
 end
@@ -510,7 +510,7 @@ let
 
     rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)
 
-    wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = synchronize(rocstreams[n,i]);
+    wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);
 
     function allocate_rocstreams_iread(fields::GGArray...)
         if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a ROCArray
@@ -521,15 +521,15 @@ let
     function iread_recvbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
-            if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+            # if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                 ranges = recvranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                 halosize = [r[end] - r[1] + 1 for r in ranges];
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
-            else
-                read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
-            end
+            # else
+            #     read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
+            # end
         end
     end
 
@@ -688,11 +688,6 @@ end
 #     )
 #     return nothing
 # end
-function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
-    AMDGPU.stream!(rocstream)
-    AMDGPU.Base.copyto!(sendbuf, 1, A, 1, sendranges; async=true)
-    return nothing
-end
 
 # # Read from the receive buffer on the host and store on the array on the device (h2d).
 # function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
@@ -707,11 +702,6 @@ end
 #     )
 #     return nothing
 # end
-function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
-    AMDGPU.stream!(rocstream)
-    AMDGPU.Base.copyto!(recvbuf, 1, A, 1, recvranges)
-    return nothing
-end
 
 ##------------------------------
 ## FUNCTIONS TO SEND/RECV FIELDS

From a1b50fcce6b1c840ae5eb75cec3a57ff64c26573 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Tue, 18 Jul 2023 18:42:02 +0300
Subject: [PATCH 07/21] Test passing on single GPU

---
 test/test_update_halo.jl | 1688 +++++++++++++++++++-------------------
 1 file changed, 844 insertions(+), 844 deletions(-)

diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index e08c873..4344fa7 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -198,856 +198,856 @@ dz = 1.0
             end
             GG.free_update_halo_buffers();
             GG.allocate_bufs(Y, Z);
-            # for dim = 1:ndims(Y), n = 1:nneighbors_per_dim
-            #     @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
-            #     @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
-            # end
-            # for dim = 1:ndims(Z), n = 1:nneighbors_per_dim
-            #     @test all(size(sendbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
-            #     @test all(size(recvbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
-            # end
+            for dim = 1:ndims(Y), n = 1:nneighbors_per_dim
+                @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
+                @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim])
+            end
+            for dim = 1:ndims(Z), n = 1:nneighbors_per_dim
+                @test all(size(sendbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
+                @test all(size(recvbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim])
+            end
         end;
         finalize_global_grid(finalize_MPI=false);
     end;
 
-    # @testset "3. data transfer components" begin
-    #     @testset "iwrite_sendbufs! / iread_recvbufs!" begin
-    #         @testset "sendranges / recvranges ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-    #             P   = zeros(nx,  ny,  nz  );
-    #             A   = zeros(nx-1,ny+2,nz+1);
-    #             @test GG.sendranges(1, 1, P) == [                    2:2,             1:size(P,2),             1:size(P,3)]
-    #             @test GG.sendranges(2, 1, P) == [size(P,1)-1:size(P,1)-1,             1:size(P,2),             1:size(P,3)]
-    #             @test GG.sendranges(1, 2, P) == [            1:size(P,1),                     2:2,             1:size(P,3)]
-    #             @test GG.sendranges(2, 2, P) == [            1:size(P,1), size(P,2)-1:size(P,2)-1,             1:size(P,3)]
-    #             @test GG.sendranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     3:3]
-    #             @test GG.sendranges(2, 3, P) == [            1:size(P,1),             1:size(P,2), size(P,3)-2:size(P,3)-2]
-    #             @test GG.recvranges(1, 1, P) == [                    1:1,             1:size(P,2),             1:size(P,3)]
-    #             @test GG.recvranges(2, 1, P) == [    size(P,1):size(P,1),             1:size(P,2),             1:size(P,3)]
-    #             @test GG.recvranges(1, 2, P) == [            1:size(P,1),                     1:1,             1:size(P,3)]
-    #             @test GG.recvranges(2, 2, P) == [            1:size(P,1),     size(P,2):size(P,2),             1:size(P,3)]
-    #             @test GG.recvranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     1:1]
-    #             @test GG.recvranges(2, 3, P) == [            1:size(P,1),             1:size(P,2),     size(P,3):size(P,3)]
-    #             @test_throws ErrorException  GG.sendranges(1, 1, A)
-    #             @test_throws ErrorException  GG.sendranges(2, 1, A)
-    #             @test GG.sendranges(1, 2, A) == [            1:size(A,1),                     4:4,             1:size(A,3)]
-    #             @test GG.sendranges(2, 2, A) == [            1:size(A,1), size(A,2)-3:size(A,2)-3,             1:size(A,3)]
-    #             @test GG.sendranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     4:4]
-    #             @test GG.sendranges(2, 3, A) == [            1:size(A,1),             1:size(A,2), size(A,3)-3:size(A,3)-3]
-    #             @test_throws ErrorException  GG.recvranges(1, 1, A)
-    #             @test_throws ErrorException  GG.recvranges(2, 1, A)
-    #             @test GG.recvranges(1, 2, A) == [            1:size(A,1),                     1:1,             1:size(A,3)]
-    #             @test GG.recvranges(2, 2, A) == [            1:size(A,1),     size(A,2):size(A,2),             1:size(A,3)]
-    #             @test GG.recvranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     1:1]
-    #             @test GG.recvranges(2, 3, A) == [            1:size(A,1),             1:size(A,2),     size(A,3):size(A,3)]
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "write_h2h! / read_h2h!" begin
-    #             init_global_grid(nx, ny, nz; quiet=true, init_MPI=false);
-    #             P  = zeros(nx,  ny,  nz  );
-    #             P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-    #             P2 = zeros(size(P));
-    #             # (dim=1)
-    #             buf = zeros(size(P,2), size(P,3));
-    #             ranges = [2:2, 1:size(P,2), 1:size(P,3)];
-    #             GG.write_h2h!(buf, P, ranges, 1);
-    #             @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
-    #             GG.read_h2h!(buf, P2, ranges, 1);
-    #             @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
-    #             # (dim=2)
-    #             buf = zeros(size(P,1), size(P,3));
-    #             ranges = [1:size(P,1), 3:3, 1:size(P,3)];
-    #             GG.write_h2h!(buf, P, ranges, 2);
-    #             @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
-    #             GG.read_h2h!(buf, P2, ranges, 2);
-    #             @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
-    #             # (dim=3)
-    #             buf = zeros(size(P,1), size(P,2));
-    #             ranges = [1:size(P,1), 1:size(P,2), 4:4];
-    #             GG.write_h2h!(buf, P, ranges, 3);
-    #             @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
-    #             GG.read_h2h!(buf, P2, ranges, 3);
-    #             @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @static if test_cuda || test_amdgpu
-    #             @testset "write_d2x! / write_d2h_async! / read_x2d! / read_h2d_async! ($array_type arrays)" for (array_type, device_type, gpuzeros, GPUArray) in zip(gpu_array_types, gpu_device_types, gpu_allocators, GPUArrayConstructors)
-    #                 init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
-    #                 P  = zeros(nx,  ny,  nz  );
-    #                 P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-    #                 P  = GPUArray(P);
-    #                 if array_type == "CUDA"
-    #                     # (dim=1)
-    #                     dim = 1;
-    #                     P2  = gpuzeros(eltype(P),size(P));
-    #                     buf = zeros(size(P,2), size(P,3));
-    #                     buf_d, buf_h = GG.register(CuArray,buf);
-    #                     ranges = [2:2, 1:size(P,2), 1:size(P,3)];
-    #                     nthreads = (1, 1, 1);
-    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
-    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-    #                     @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     buf .= 0.0;
-    #                     P2  .= 0.0;
-    #                     custream = stream();
-    #                     GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     CUDA.Mem.unregister(buf_h);
-    #                     # (dim=2)
-    #                     dim = 2;
-    #                     P2  = gpuzeros(eltype(P),size(P));
-    #                     buf = zeros(size(P,1), size(P,3));
-    #                     buf_d, buf_h = GG.register(CuArray,buf);
-    #                     ranges = [1:size(P,1), 3:3, 1:size(P,3)];
-    #                     nthreads = (1, 1, 1);
-    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
-    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-    #                     @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     buf .= 0.0;
-    #                     P2  .= 0.0;
-    #                     custream = stream();
-    #                     GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     CUDA.Mem.unregister(buf_h);
-    #                     # (dim=3)
-    #                     dim = 3
-    #                     P2  = gpuzeros(eltype(P),size(P));
-    #                     buf = zeros(size(P,1), size(P,2));
-    #                     buf_d, buf_h = GG.register(CuArray,buf);
-    #                     ranges = [1:size(P,1), 1:size(P,2), 4:4];
-    #                     nthreads = (1, 1, 1);
-    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
-    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-    #                     @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     buf .= 0.0;
-    #                     P2  .= 0.0;
-    #                     custream = stream();
-    #                     GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     CUDA.Mem.unregister(buf_h);
-    #                 elseif array_type == "AMDGPU"
-    #                     @info "hi"
-    #                     # (dim=1)
-    #                     dim = 1;
-    #                     P2  = gpuzeros(eltype(P),size(P));
-    #                     buf = zeros(size(P,2), size(P,3));
-    #                     buf_d, buf_h = GG.register(ROCArray,buf);
-    #                     ranges = [2:2, 1:size(P,2), 1:size(P,3)];
-    #                     nthreads = (1, 1, 1);
-    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
-    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-    #                     @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # buf .= 0.0;
-    #                     # P2  .= 0.0;
-    #                     # rocstream = AMDGPU.HIPStream();
-    #                     # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-    #                     # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-    #                     # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # AMDGPU.Mem.unlock(buf_h);
-    #                     # (dim=2)
-    #                     dim = 2;
-    #                     P2  = gpuzeros(eltype(P),size(P));
-    #                     buf = zeros(size(P,1), size(P,3));
-    #                     buf_d, buf_h = GG.register(CuArray,buf);
-    #                     ranges = [1:size(P,1), 3:3, 1:size(P,3)];
-    #                     nthreads = (1, 1, 1);
-    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
-    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-    #                     @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # buf .= 0.0;
-    #                     # P2  .= 0.0;
-    #                     # rocstream = AMDGPU.HIPStream();
-    #                     # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-    #                     # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-    #                     # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # AMDGPU.Mem.unlock(buf_h);
-    #                     # (dim=3)
-    #                     dim = 3
-    #                     P2  = gpuzeros(eltype(P),size(P));
-    #                     buf = zeros(size(P,1), size(P,2));
-    #                     buf_d, buf_h = GG.register(CuArray,buf);
-    #                     ranges = [1:size(P,1), 1:size(P,2), 4:4];
-    #                     nthreads = (1, 1, 1);
-    #                     halosize = [r[end] - r[1] + 1 for r in ranges];
-    #                     nblocks  = Tuple(ceil.(Int, halosize./nthreads));
-    #                     @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-    #                     @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
-    #                     @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # buf .= 0.0;
-    #                     # P2  .= 0.0;
-    #                     # rocstream = AMDGPU.HIPStream();
-    #                     # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-    #                     # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-    #                     # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-    #                     # AMDGPU.Mem.unlock(buf_h);
-    #                 end
-    #                 finalize_global_grid(finalize_MPI=false);
-    #             end;
-    #         end
-    #         @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-    #             P = zeros(nx,  ny,  nz  );
-    #             A = zeros(nx-1,ny+2,nz+1);
-    #             P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]);
-    #             A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]);
-    #             GG.allocate_bufs(P, A);
-    #             if     (array_type == "CUDA")   GG.allocate_custreams(P, A);
-    #             elseif (array_type == "AMDGPU") GG.allocate_rocqueues(P, A);
-    #             else                            GG.allocate_tasks(P, A);
-    #             end
-    #             dim = 1
-    #             n = 1
-    #             GG.iwrite_sendbufs!(n, dim, P, 1);
-    #             GG.iwrite_sendbufs!(n, dim, A, 2);
-    #             GG.wait_iwrite(n, P, 1);
-    #             GG.wait_iwrite(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
-    #             else
-    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:]))
-    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
-    #             end
-    #             n = 2
-    #             GG.iwrite_sendbufs!(n, dim, P, 1);
-    #             GG.iwrite_sendbufs!(n, dim, A, 2);
-    #             GG.wait_iwrite(n, P, 1);
-    #             GG.wait_iwrite(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
-    #             else
-    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:]))
-    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
-    #             end
-    #             dim = 2
-    #             n = 1
-    #             GG.iwrite_sendbufs!(n, dim, P, 1);
-    #             GG.iwrite_sendbufs!(n, dim, A, 2);
-    #             GG.wait_iwrite(n, P, 1);
-    #             GG.wait_iwrite(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))
-    #             else
-    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:]))
-    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:]))
-    #             end
-    #             n = 2
-    #             GG.iwrite_sendbufs!(n, dim, P, 1);
-    #             GG.iwrite_sendbufs!(n, dim, A, 2);
-    #             GG.wait_iwrite(n, P, 1);
-    #             GG.wait_iwrite(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))
-    #             else
-    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:]))
-    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:]))
-    #             end
-    #             dim = 3
-    #             n = 1
-    #             GG.iwrite_sendbufs!(n, dim, P, 1);
-    #             GG.iwrite_sendbufs!(n, dim, A, 2);
-    #             GG.wait_iwrite(n, P, 1);
-    #             GG.wait_iwrite(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))
-    #             else
-    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:]))
-    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:]))
-    #             end
-    #             n = 2
-    #             GG.iwrite_sendbufs!(n, dim, P, 1);
-    #             GG.iwrite_sendbufs!(n, dim, A, 2);
-    #             GG.wait_iwrite(n, P, 1);
-    #             GG.wait_iwrite(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))
-    #                 @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))
-    #             else
-    #                 @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:]))
-    #                 @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:]))
-    #             end
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "iread_recvbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-    #             P = zeros(nx,  ny,  nz  );
-    #             A = zeros(nx-1,ny+2,nz+1);
-    #             GG.allocate_bufs(P, A);
-    #             if     (array_type == "CUDA")   GG.allocate_custreams(P, A);
-    #             elseif (array_type == "AMDGPU") GG.allocate_rocqueues(P, A);
-    #             else                            GG.allocate_tasks(P, A);
-    #             end
-    #             dim = 1
-    #             for n = 1:nneighbors_per_dim
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                     GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                 else
-    #                     GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                     GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                 end
-    #             end
-    #             n = 1
-    #             GG.iread_recvbufs!(n, dim, P, 1);
-    #             GG.iread_recvbufs!(n, dim, A, 2);
-    #             GG.wait_iread(n, P, 1);
-    #             GG.wait_iread(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))
-    #                 @test all(                          0.0 .== Array(A[1,:,:][:]))
-    #             else
-    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:]))
-    #                 @test all(                       0.0 .== CPUArray(A[1,:,:][:]))
-    #             end
-    #             n = 2
-    #             GG.iread_recvbufs!(n, dim, P, 1);
-    #             GG.iread_recvbufs!(n, dim, A, 2);
-    #             GG.wait_iread(n, P, 1);
-    #             GG.wait_iread(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))
-    #                 @test all(                          0.0 .== Array(A[end,:,:][:]))
-    #             else
-    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:]))
-    #                 @test all(                       0.0 .== CPUArray(A[end,:,:][:]))
-    #             end
-    #             dim = 2
-    #             for n = 1:nneighbors_per_dim
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                     GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                 else
-    #                     GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                     GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                 end
-    #             end
-    #             n = 1
-    #             GG.iread_recvbufs!(n, dim, P, 1);
-    #             GG.iread_recvbufs!(n, dim, A, 2);
-    #             GG.wait_iread(n, P, 1);
-    #             GG.wait_iread(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))
-    #             else
-    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:]))
-    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:]))
-    #             end
-    #             n = 2
-    #             GG.iread_recvbufs!(n, dim, P, 1);
-    #             GG.iread_recvbufs!(n, dim, A, 2);
-    #             GG.wait_iread(n, P, 1);
-    #             GG.wait_iread(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))
-    #             else
-    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:]))
-    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:]))
-    #             end
-    #             dim = 3
-    #             for n = 1:nneighbors_per_dim
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                     GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                 else
-    #                     GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                     GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                 end
-    #             end
-    #             n = 1
-    #             GG.iread_recvbufs!(n, dim, P, 1);
-    #             GG.iread_recvbufs!(n, dim, A, 2);
-    #             GG.wait_iread(n, P, 1);
-    #             GG.wait_iread(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))
-    #             else
-    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:]))
-    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:]))
-    #             end
-    #             n = 2
-    #             GG.iread_recvbufs!(n, dim, P, 1);
-    #             GG.iread_recvbufs!(n, dim, A, 2);
-    #             GG.wait_iread(n, P, 1);
-    #             GG.wait_iread(n, A, 2);
-    #             if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))
-    #                 @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))
-    #             else
-    #                 @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:]))
-    #                 @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:]))
-    #             end
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         if (nprocs==1)
-    #             @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
-    #                 init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-    #                 P = zeros(nx,  ny,  nz  );
-    #                 A = zeros(nx-1,ny+2,nz+1);
-    #                 GG.allocate_bufs(P, A);
-    #                 dim = 1
-    #                 for n = 1:nneighbors_per_dim
-    #                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                         GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                         GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                     else
-    #                         GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                         GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                     end
-    #                 end
-    #                 for n = 1:nneighbors_per_dim
-    #                     GG.sendrecv_halo_local(n, dim, P, 1);
-    #                     GG.sendrecv_halo_local(n, dim, A, 2);
-    #                 end
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-    #                     @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-    #                     @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-    #                     @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-    #                 else
-    #                     @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
-    #                     @test all(GG.recvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-    #                     @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
-    #                     @test all(GG.recvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-    #                 end
-    #                 dim = 2
-    #                 for n = 1:nneighbors_per_dim
-    #                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                         GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                         GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                     else
-    #                         GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                         GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                     end
-    #                 end
-    #                 for n = 1:nneighbors_per_dim
-    #                     GG.sendrecv_halo_local(n, dim, P, 1);
-    #                     GG.sendrecv_halo_local(n, dim, A, 2);
-    #                 end
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-    #                     @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
-    #                     @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-    #                     @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
-    #                 else
-    #                     @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
-    #                     @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
-    #                     @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
-    #                     @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
-    #                 end
-    #                 dim = 3
-    #                 for n = 1:nneighbors_per_dim
-    #                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                         GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                         GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                     else
-    #                         GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
-    #                         GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
-    #                     end
-    #                 end
-    #                 for n = 1:nneighbors_per_dim
-    #                     GG.sendrecv_halo_local(n, dim, P, 1);
-    #                     GG.sendrecv_halo_local(n, dim, A, 2);
-    #                 end
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-    #                     @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
-    #                     @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-    #                     @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
-    #                 else
-    #                     @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
-    #                     @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
-    #                     @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
-    #                     @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
-    #                 end
-    #                 finalize_global_grid(finalize_MPI=false);
-    #             end
-    #         end
-    #     end;
-    #     if (nprocs>1)
-    #         @testset "irecv_halo! / isend_halo ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)
-    #             me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             P   = zeros(nx,ny,nz);
-    #             A   = zeros(nx,ny,nz);
-    #             dim = 1;
-    #             GG.allocate_bufs(P, A);
-    #             for n = 1:nneighbors_per_dim
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     GG.gpusendbuf(n,dim,1,P) .= 9.0;
-    #                     GG.gpurecvbuf(n,dim,1,P) .= 0;
-    #                     GG.gpusendbuf(n,dim,2,A) .= 9.0;
-    #                     GG.gpurecvbuf(n,dim,2,A) .= 0;
-    #                 else
-    #                     GG.sendbuf(n,dim,1,P) .= 9.0;
-    #                     GG.recvbuf(n,dim,1,P) .= 0;
-    #                     GG.sendbuf(n,dim,2,A) .= 9.0;
-    #                     GG.recvbuf(n,dim,2,A) .= 0;
-    #                 end
-    #             end
-    #             reqs  = fill(MPI.REQUEST_NULL, 2, nneighbors_per_dim, 2);
-    #             for n = 1:nneighbors_per_dim
-    #                 reqs[1,n,1] = GG.irecv_halo!(n, dim, P, 1);
-    #                 reqs[2,n,1] = GG.irecv_halo!(n, dim, A, 2);
-    #                 reqs[1,n,2] = GG.isend_halo(n, dim, P, 1);
-    #                 reqs[2,n,2] = GG.isend_halo(n, dim, A, 2);
-    #             end
-    #             @test all(reqs .!= [MPI.REQUEST_NULL])
-    #             MPI.Waitall!(reqs[:]);
-    #             for n = 1:nneighbors_per_dim
-    #                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-    #                     @test all(GG.gpurecvbuf(n,dim,1,P) .== 9.0)
-    #                     @test all(GG.gpurecvbuf(n,dim,2,A) .== 9.0)
-    #                 else
-    #                     @test all(GG.recvbuf(n,dim,1,P) .== 9.0)
-    #                     @test all(GG.recvbuf(n,dim,2,A) .== 9.0)
-    #                 end
-    #             end
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #     end
-    # end;
+    @testset "3. data transfer components" begin
+        @testset "iwrite_sendbufs! / iread_recvbufs!" begin
+            @testset "sendranges / recvranges ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)  # one run per backend; the loop variable `zeros` is that backend's allocator
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);  # periodic in all three dims; z overlap set to 3
+                P   = zeros(nx,  ny,  nz  );  # array with exactly the grid size
+                A   = zeros(nx-1,ny+2,nz+1);  # array whose size differs from the grid size in every dim
+                @test GG.sendranges(1, 1, P) == [                    2:2,             1:size(P,2),             1:size(P,3)]  # P, dims 1 and 2: the 2nd plane from each boundary is expected to be sent
+                @test GG.sendranges(2, 1, P) == [size(P,1)-1:size(P,1)-1,             1:size(P,2),             1:size(P,3)]
+                @test GG.sendranges(1, 2, P) == [            1:size(P,1),                     2:2,             1:size(P,3)]
+                @test GG.sendranges(2, 2, P) == [            1:size(P,1), size(P,2)-1:size(P,2)-1,             1:size(P,3)]
+                @test GG.sendranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     3:3]  # P, dim 3: the 3rd plane from each boundary is expected (grid was created with overlapz=3)
+                @test GG.sendranges(2, 3, P) == [            1:size(P,1),             1:size(P,2), size(P,3)-2:size(P,3)-2]
+                @test GG.recvranges(1, 1, P) == [                    1:1,             1:size(P,2),             1:size(P,3)]  # P: the receive range is expected to be the outermost plane on each side
+                @test GG.recvranges(2, 1, P) == [    size(P,1):size(P,1),             1:size(P,2),             1:size(P,3)]
+                @test GG.recvranges(1, 2, P) == [            1:size(P,1),                     1:1,             1:size(P,3)]
+                @test GG.recvranges(2, 2, P) == [            1:size(P,1),     size(P,2):size(P,2),             1:size(P,3)]
+                @test GG.recvranges(1, 3, P) == [            1:size(P,1),             1:size(P,2),                     1:1]
+                @test GG.recvranges(2, 3, P) == [            1:size(P,1),             1:size(P,2),     size(P,3):size(P,3)]
+                @test_throws ErrorException  GG.sendranges(1, 1, A)  # A, dim 1: an error is expected; presumably A has no halo there (size nx-1) - confirm against the overlap computation
+                @test_throws ErrorException  GG.sendranges(2, 1, A)
+                @test GG.sendranges(1, 2, A) == [            1:size(A,1),                     4:4,             1:size(A,3)]  # A, dims 2 and 3: the 4th plane from each boundary is expected to be sent
+                @test GG.sendranges(2, 2, A) == [            1:size(A,1), size(A,2)-3:size(A,2)-3,             1:size(A,3)]
+                @test GG.sendranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     4:4]
+                @test GG.sendranges(2, 3, A) == [            1:size(A,1),             1:size(A,2), size(A,3)-3:size(A,3)-3]
+                @test_throws ErrorException  GG.recvranges(1, 1, A)  # same expectation as for sendranges: no ranges for A in dim 1
+                @test_throws ErrorException  GG.recvranges(2, 1, A)
+                @test GG.recvranges(1, 2, A) == [            1:size(A,1),                     1:1,             1:size(A,3)]  # A: the receive range is still expected to be the outermost plane on each side
+                @test GG.recvranges(2, 2, A) == [            1:size(A,1),     size(A,2):size(A,2),             1:size(A,3)]
+                @test GG.recvranges(1, 3, A) == [            1:size(A,1),             1:size(A,2),                     1:1]
+                @test GG.recvranges(2, 3, A) == [            1:size(A,1),             1:size(A,2),     size(A,3):size(A,3)]
+                finalize_global_grid(finalize_MPI=false);  # tear down the grid but leave MPI initialized
+            end;
+            @testset "write_h2h! / read_h2h!" begin  # checks packing an array plane into a buffer and unpacking it again, for each dim
+                init_global_grid(nx, ny, nz; quiet=true, init_MPI=false);
+                P  = zeros(nx,  ny,  nz  );
+                P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];  # each cell value is built from its (ix, iy, iz) indices
+                P2 = zeros(size(P));  # all-zero target array for the read direction
+                # (dim=1) plane at index 2 along dim 1
+                buf = zeros(size(P,2), size(P,3));  # buffer shaped like one plane normal to dim 1
+                ranges = [2:2, 1:size(P,2), 1:size(P,3)];
+                GG.write_h2h!(buf, P, ranges, 1);
+                @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])  # buf is expected to hold the selected plane of P
+                GG.read_h2h!(buf, P2, ranges, 1);
+                @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])  # the same plane of P2 is expected to hold the content of buf
+                # (dim=2) plane at index 3 along dim 2
+                buf = zeros(size(P,1), size(P,3));
+                ranges = [1:size(P,1), 3:3, 1:size(P,3)];
+                GG.write_h2h!(buf, P, ranges, 2);
+                @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
+                GG.read_h2h!(buf, P2, ranges, 2);
+                @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
+                # (dim=3) plane at index 4 along dim 3
+                buf = zeros(size(P,1), size(P,2));
+                ranges = [1:size(P,1), 1:size(P,2), 4:4];
+                GG.write_h2h!(buf, P, ranges, 3);
+                @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:])
+                GG.read_h2h!(buf, P2, ranges, 3);
+                @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:])
+                finalize_global_grid(finalize_MPI=false);  # tear down the grid but leave MPI initialized
+            end;
+            @static if test_cuda || test_amdgpu  # GPU-only tests: included only when at least one GPU backend is under test
+                @testset "write_d2x! / write_d2h_async! / read_x2d! / read_h2d_async! ($array_type arrays)" for (array_type, device_type, gpuzeros, GPUArray) in zip(gpu_array_types, gpu_device_types, gpu_allocators, GPUArrayConstructors)
+                    init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
+                    P  = zeros(nx,  ny,  nz  );
+                    P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];  # each cell value is built from its (ix, iy, iz) indices
+                    P  = GPUArray(P);  # convert the filled array with the backend's array constructor
+                    if array_type == "CUDA"
+                        # (dim=1) plane at index 2 along dim 1
+                        dim = 1;
+                        P2  = gpuzeros(eltype(P),size(P));  # all-zero target array for the read direction
+                        buf = zeros(size(P,2), size(P,3));
+                        buf_d, buf_h = GG.register(CuArray,buf);  # NOTE(review): buf_d presumably aliases the memory of buf, since the tests read buf after the kernel wrote buf_d - confirm
+                        ranges = [2:2, 1:size(P,2), 1:size(P,3)];
+                        nthreads = (1, 1, 1);  # one thread per block ...
+                        halosize = [r[end] - r[1] + 1 for r in ranges];  # number of cells of the plane in each dim
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));  # ... hence one block per cell of the plane
+                        @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))  # buffer is expected to hold the selected plane of P
+                        @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))  # the same plane of P2 is expected to hold the buffer content
+                        buf .= 0.0;  # reset buffer and target before repeating the round trip with the async copy functions
+                        P2  .= 0.0;
+                        custream = stream();
+                        GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        CUDA.Mem.unregister(buf_h);  # undo the registration obtained from GG.register
+                        # (dim=2) plane at index 3 along dim 2; same sequence as for dim 1
+                        dim = 2;
+                        P2  = gpuzeros(eltype(P),size(P));
+                        buf = zeros(size(P,1), size(P,3));
+                        buf_d, buf_h = GG.register(CuArray,buf);
+                        ranges = [1:size(P,1), 3:3, 1:size(P,3)];
+                        nthreads = (1, 1, 1);
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        buf .= 0.0;
+                        P2  .= 0.0;
+                        custream = stream();
+                        GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        CUDA.Mem.unregister(buf_h);
+                        # (dim=3) plane at index 4 along dim 3; same sequence as for dim 1
+                        dim = 3
+                        P2  = gpuzeros(eltype(P),size(P));
+                        buf = zeros(size(P,1), size(P,2));
+                        buf_d, buf_h = GG.register(CuArray,buf);
+                        ranges = [1:size(P,1), 1:size(P,2), 4:4];
+                        nthreads = (1, 1, 1);
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @cuda blocks=nblocks threads=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        @cuda blocks=nblocks threads=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        buf .= 0.0;
+                        P2  .= 0.0;
+                        custream = stream();
+                        GG.write_d2h_async!(buf, P, ranges, custream); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        GG.read_h2d_async!(buf, P2, ranges, custream); CUDA.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        CUDA.Mem.unregister(buf_h);
+                    elseif array_type == "AMDGPU"
+                        @info "needs async memcopy fix"  # the async copy checks in this branch are disabled (commented out); only the kernel-based checks run
+                        # (dim=1) plane at index 2 along dim 1
+                        dim = 1;
+                        P2  = gpuzeros(eltype(P),size(P));
+                        buf = zeros(size(P,2), size(P,3));
+                        buf_d = GG.register(ROCArray,buf);  # single return value here, unlike the CuArray call in the CUDA branch
+                        ranges = [2:2, 1:size(P,2), 1:size(P,3)];
+                        nthreads = (1, 1, 1);
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        buf .= 0.0;  # reset is only consumed by the disabled async checks below
+                        P2  .= 0.0;
+                        # rocstream = AMDGPU.HIPStream();
+                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        # AMDGPU.unsafe_free!(buf_d);
+                        # (dim=2) plane at index 3 along dim 2; NOTE(review): with the free above disabled, buf_d is never released explicitly - confirm this is intended
+                        dim = 2;
+                        P2  = gpuzeros(eltype(P),size(P));
+                        buf = zeros(size(P,1), size(P,3));
+                        buf_d = GG.register(ROCArray,buf);
+                        ranges = [1:size(P,1), 3:3, 1:size(P,3)];
+                        nthreads = (1, 1, 1);
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        buf .= 0.0;
+                        P2  .= 0.0;
+                        # rocstream = AMDGPU.HIPStream();
+                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        # AMDGPU.unsafe_free!(buf_d);
+                        # (dim=3) plane at index 4 along dim 3
+                        dim = 3
+                        P2  = gpuzeros(eltype(P),size(P));
+                        buf = zeros(size(P,1), size(P,2));
+                        buf_d = GG.register(ROCArray,buf);
+                        ranges = [1:size(P,1), 1:size(P,2), 4:4];
+                        nthreads = (1, 1, 1);
+                        halosize = [r[end] - r[1] + 1 for r in ranges];
+                        nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                        @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        buf .= 0.0;
+                        P2  .= 0.0;
+                        # rocstream = AMDGPU.HIPStream();
+                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        # AMDGPU.unsafe_free!(buf_d);
+                    end
+                    finalize_global_grid(finalize_MPI=false);  # tear down the grid but leave MPI initialized
+                end;
+            end
+            @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)  # inside the loop, `zeros` and `Array` are the backend's allocator and array constructor
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+                P = zeros(nx,  ny,  nz  );
+                A = zeros(nx-1,ny+2,nz+1);
+                P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]);  # each cell value is built from its (ix, iy, iz) indices
+                A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]);
+                GG.allocate_bufs(P, A);
+                if     (array_type == "CUDA")   GG.allocate_custreams(P, A);  # per-backend resources allocated before iwrite_sendbufs! is called: streams on GPU, tasks otherwise
+                elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A);
+                else                            GG.allocate_tasks(P, A);
+                end
+                dim = 1
+                n = 1
+                GG.iwrite_sendbufs!(n, dim, P, 1);  # start the write for each array (last argument: 1 for P, 2 for A) ...
+                GG.iwrite_sendbufs!(n, dim, A, 2);
+                GG.wait_iwrite(n, P, 1);  # ... and wait for it before inspecting the buffers
+                GG.wait_iwrite(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))  # with GPU-aware MPI in this dim the GPU send buffer is checked, else the host one
+                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))  # P: the 2nd plane from the boundary is expected in the send buffer
+                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)  # A: buffer expected to stay zero in dim 1; presumably A has no halo there - confirm
+                else
+                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:]))
+                    @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
+                end
+                n = 2
+                GG.iwrite_sendbufs!(n, dim, P, 1);
+                GG.iwrite_sendbufs!(n, dim, A, 2);
+                GG.wait_iwrite(n, P, 1);
+                GG.wait_iwrite(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))
+                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
+                else
+                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:]))
+                    @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
+                end
+                dim = 2
+                n = 1
+                GG.iwrite_sendbufs!(n, dim, P, 1);
+                GG.iwrite_sendbufs!(n, dim, A, 2);
+                GG.wait_iwrite(n, P, 1);
+                GG.wait_iwrite(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))
+                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))  # A: the 4th plane from the boundary is expected in dim 2
+                else
+                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:]))
+                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:]))
+                end
+                n = 2
+                GG.iwrite_sendbufs!(n, dim, P, 1);
+                GG.iwrite_sendbufs!(n, dim, A, 2);
+                GG.wait_iwrite(n, P, 1);
+                GG.wait_iwrite(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))
+                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))
+                else
+                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:]))
+                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:]))
+                end
+                dim = 3
+                n = 1
+                GG.iwrite_sendbufs!(n, dim, P, 1);
+                GG.iwrite_sendbufs!(n, dim, A, 2);
+                GG.wait_iwrite(n, P, 1);
+                GG.wait_iwrite(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))  # P: the 3rd plane is expected in dim 3 (grid was created with overlapz=3)
+                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))
+                else
+                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:]))
+                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:]))
+                end
+                n = 2
+                GG.iwrite_sendbufs!(n, dim, P, 1);
+                GG.iwrite_sendbufs!(n, dim, A, 2);
+                GG.wait_iwrite(n, P, 1);
+                GG.wait_iwrite(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))
+                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))
+                else
+                    @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:]))
+                    @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:]))
+                end
+                finalize_global_grid(finalize_MPI=false);  # tear down the grid but leave MPI initialized
+            end;
+            @testset "iread_recvbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors)  # inside the loop, `zeros` and `Array` are the backend's allocator and array constructor
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+                P = zeros(nx,  ny,  nz  );  # both arrays start out all-zero
+                A = zeros(nx-1,ny+2,nz+1);
+                GG.allocate_bufs(P, A);
+                if     (array_type == "CUDA")   GG.allocate_custreams(P, A);  # per-backend resources allocated before iread_recvbufs! is called: streams on GPU, tasks otherwise
+                elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A);
+                else                            GG.allocate_tasks(P, A);
+                end
+                dim = 1
+                for n = 1:nneighbors_per_dim  # pre-fill the receive buffers of this dim with recognizable nonzero markers
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))  # with GPU-aware MPI in this dim the GPU receive buffer is used, else the host one
+                        GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;  # marker built from dim, neighbor n and the buffer index (1 for P, 2 for A)
+                        GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                    else
+                        GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                        GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                    end
+                end
+                n = 1
+                GG.iread_recvbufs!(n, dim, P, 1);  # start the read for each array ...
+                GG.iread_recvbufs!(n, dim, A, 2);
+                GG.wait_iread(n, P, 1);  # ... and wait for it before inspecting the arrays
+                GG.wait_iread(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))  # P: the outermost plane is expected to hold the buffer content
+                    @test all(                          0.0 .== Array(A[1,:,:][:]))  # A: expected to stay zero in dim 1 despite the nonzero buffer; presumably A has no halo there - confirm
+                else
+                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:]))
+                    @test all(                       0.0 .== CPUArray(A[1,:,:][:]))
+                end
+                n = 2
+                GG.iread_recvbufs!(n, dim, P, 1);
+                GG.iread_recvbufs!(n, dim, A, 2);
+                GG.wait_iread(n, P, 1);
+                GG.wait_iread(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))
+                    @test all(                          0.0 .== Array(A[end,:,:][:]))
+                else
+                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:]))
+                    @test all(                       0.0 .== CPUArray(A[end,:,:][:]))
+                end
+                dim = 2
+                for n = 1:nneighbors_per_dim  # same marker fill as for dim 1
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                        GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                        GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                    else
+                        GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                        GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                    end
+                end
+                n = 1
+                GG.iread_recvbufs!(n, dim, P, 1);
+                GG.iread_recvbufs!(n, dim, A, 2);
+                GG.wait_iread(n, P, 1);
+                GG.wait_iread(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))
+                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))  # in dims 2 and 3, A's outermost plane is expected to hold the buffer content as well
+                else
+                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:]))
+                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:]))
+                end
+                n = 2
+                GG.iread_recvbufs!(n, dim, P, 1);
+                GG.iread_recvbufs!(n, dim, A, 2);
+                GG.wait_iread(n, P, 1);
+                GG.wait_iread(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))
+                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))
+                else
+                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:]))
+                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:]))
+                end
+                dim = 3
+                for n = 1:nneighbors_per_dim  # same marker fill as for dim 1
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                        GG.gpurecvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                        GG.gpurecvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                    else
+                        GG.recvbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                        GG.recvbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                    end
+                end
+                n = 1
+                GG.iread_recvbufs!(n, dim, P, 1);
+                GG.iread_recvbufs!(n, dim, A, 2);
+                GG.wait_iread(n, P, 1);
+                GG.wait_iread(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))
+                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))
+                else
+                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:]))
+                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:]))
+                end
+                n = 2
+                GG.iread_recvbufs!(n, dim, P, 1);
+                GG.iread_recvbufs!(n, dim, A, 2);
+                GG.wait_iread(n, P, 1);
+                GG.wait_iread(n, A, 2);
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))
+                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))
+                else
+                    @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:]))
+                    @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:]))
+                end
+                finalize_global_grid(finalize_MPI=false);  # tear down the grid but leave MPI initialized
+            end;
+            if (nprocs==1)  # Single-process only: checks that sendrecv_halo_local fills each receive buffer from the opposite neighbor's send buffer.
+                @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)  # `zeros` is rebound to the allocator of the array type under test
+                    init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);  # periodic in x, y and z
+                    P = zeros(nx,  ny,  nz  );
+                    A = zeros(nx-1,ny+2,nz+1);  # size differs from (nx,ny,nz) in every dimension; per the checks below it has no halo in dim 1
+                    GG.allocate_bufs(P, A);  # presumably sets up the send/receive buffers accessed below - confirm in GG
+                    dim = 1
+                    for n = 1:nneighbors_per_dim  # fill every send buffer with a value encoding dim, neighbor n and field index (1=P, 2=A)
+                        if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))  # GPU-side buffers when the matching *aware_MPI(dim) is true, CPU-side buffers otherwise
+                            GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                            GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                        else
+                            GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                            GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                        end
+                    end
+                    for n = 1:nneighbors_per_dim  # run the local exchange for both neighbors and both fields
+                        GG.sendrecv_halo_local(n, dim, P, 1);
+                        GG.sendrecv_halo_local(n, dim, A, 2);
+                    end
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));  # receive buffer 1 must equal send buffer 2
+                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));  # receive buffer 2 must equal send buffer 1
+                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+                    else
+                        @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));  # receive buffer 1 must equal send buffer 2
+                        @test all(GG.recvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+                        @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));  # receive buffer 2 must equal send buffer 1
+                        @test all(GG.recvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+                    end
+                    dim = 2  # same procedure for dim 2; here both P and A are expected to be exchanged
+                    for n = 1:nneighbors_per_dim
+                        if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                            GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                            GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                        else
+                            GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                            GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                        end
+                    end
+                    for n = 1:nneighbors_per_dim
+                        GG.sendrecv_halo_local(n, dim, P, 1);
+                        GG.sendrecv_halo_local(n, dim, A, 2);
+                    end
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
+                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
+                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
+                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
+                    else
+                        @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
+                        @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
+                        @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
+                        @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
+                    end
+                    dim = 3  # same procedure for dim 3 (the grid was initialized with overlapz=3)
+                    for n = 1:nneighbors_per_dim
+                        if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                            GG.gpusendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                            GG.gpusendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                        else
+                            GG.sendbuf_flat(n,dim,1,P) .= dim*1e2 + n*1e1 + 1;
+                            GG.sendbuf_flat(n,dim,2,A) .= dim*1e2 + n*1e1 + 2;
+                        end
+                    end
+                    for n = 1:nneighbors_per_dim
+                        GG.sendrecv_halo_local(n, dim, P, 1);
+                        GG.sendrecv_halo_local(n, dim, A, 2);
+                    end
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
+                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
+                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
+                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
+                    else
+                        @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
+                        @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
+                        @test all(GG.recvbuf_flat(2,dim,1,P) .== GG.sendbuf_flat(1,dim,1,P));
+                        @test all(GG.recvbuf_flat(2,dim,2,A) .== GG.sendbuf_flat(1,dim,2,A));
+                    end
+                    finalize_global_grid(finalize_MPI=false);  # the grid is finalized per array type; MPI itself is left initialized
+                end
+            end
+        end;
+        if (nprocs>1)  # Multi-process only: checks that irecv_halo!/isend_halo deliver the send-buffer contents into the receive buffers.
+            @testset "irecv_halo! / isend_halo ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators)  # `zeros` is rebound to the allocator of the array type under test
+                me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type);  # periodic in x; dimy=dimz=1 presumably places all processes along x - confirm
+                P   = zeros(nx,ny,nz);
+                A   = zeros(nx,ny,nz);
+                dim = 1;  # only dimension 1 is exercised in this testset
+                GG.allocate_bufs(P, A);
+                for n = 1:nneighbors_per_dim  # mark all send buffers with 9.0 and clear all receive buffers
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))  # GPU-side buffers when the matching *aware_MPI(dim) is true, CPU-side buffers otherwise
+                        GG.gpusendbuf(n,dim,1,P) .= 9.0;
+                        GG.gpurecvbuf(n,dim,1,P) .= 0;
+                        GG.gpusendbuf(n,dim,2,A) .= 9.0;
+                        GG.gpurecvbuf(n,dim,2,A) .= 0;
+                    else
+                        GG.sendbuf(n,dim,1,P) .= 9.0;
+                        GG.recvbuf(n,dim,1,P) .= 0;
+                        GG.sendbuf(n,dim,2,A) .= 9.0;
+                        GG.recvbuf(n,dim,2,A) .= 0;
+                    end
+                end
+                reqs  = fill(MPI.REQUEST_NULL, 2, nneighbors_per_dim, 2);  # indexed as [field (1=P, 2=A), neighbor n, 1=receive / 2=send]
+                for n = 1:nneighbors_per_dim
+                    reqs[1,n,1] = GG.irecv_halo!(n, dim, P, 1);
+                    reqs[2,n,1] = GG.irecv_halo!(n, dim, A, 2);
+                    reqs[1,n,2] = GG.isend_halo(n, dim, P, 1);
+                    reqs[2,n,2] = GG.isend_halo(n, dim, A, 2);
+                end
+                @test all(reqs .!= [MPI.REQUEST_NULL])  # every call must have returned a request different from the null placeholder
+                MPI.Waitall!(reqs[:]);  # block until all receives and sends have completed
+                for n = 1:nneighbors_per_dim  # each receive buffer must now hold the 9.0 written to the send buffers
+                    if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                        @test all(GG.gpurecvbuf(n,dim,1,P) .== 9.0)
+                        @test all(GG.gpurecvbuf(n,dim,2,A) .== 9.0)
+                    else
+                        @test all(GG.recvbuf(n,dim,1,P) .== 9.0)
+                        @test all(GG.recvbuf(n,dim,2,A) .== 9.0)
+                    end
+                end
+                finalize_global_grid(finalize_MPI=false);  # the grid is finalized per array type; MPI itself is left initialized
+            end;
+        end
+    end;
 
     # (Backup field filled with encoded coordinates and set boundary to zeros; then update halo and compare with backuped field; it should be the same again, except for the boundaries that are not halos)
-    # @testset "4. halo update ($array_type arrays)" for (array_type, device_type, Array) in zip(array_types, device_types, ArrayConstructors)
-    #     @testset "basic grid (default: periodic)" begin
-    #         @testset "1D" begin
-    #             init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             P     = zeros(nx);
-    #             P    .= [x_g(ix,dx,P) for ix=1:size(P,1)];
-    #             P_ref = copy(P);
-    #             P[[1, end]] .= 0.0;
-    #             P     = Array(P);
-    #             P_ref = Array(P_ref);
-    #             @require !all(CPUArray(P .== P_ref)) # DEBUG: CPUArray needed here and onwards as mapreduce! is failing on AMDGPU (see https://github.com/JuliaGPU/AMDGPU.jl/issues/210)
-    #             update_halo!(P);
-    #             @test all(CPUArray(P .== P_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "2D" begin
-    #             init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             P     = zeros(nx, ny);
-    #             P    .= [y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2)];
-    #             P_ref = copy(P);
-    #             P[[1, end],       :] .= 0.0;
-    #             P[       :,[1, end]] .= 0.0;
-    #             P     = Array(P);
-    #             P_ref = Array(P_ref);
-    #             @require !all(CPUArray(P .== P_ref))
-    #             update_halo!(P);
-    #             @test all(CPUArray(P .== P_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             P     = zeros(nx, ny, nz);
-    #             P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-    #             P_ref = copy(P);
-    #             P[[1, end],       :,       :] .= 0.0;
-    #             P[       :,[1, end],       :] .= 0.0;
-    #             P[       :,       :,[1, end]] .= 0.0;
-    #             P     = Array(P);
-    #             P_ref = Array(P_ref);
-    #             @require !all(CPUArray(P .== P_ref))
-    #             update_halo!(P);
-    #             @test all(CPUArray(P .== P_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D (non-default overlap)" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=4, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-    #             P     = zeros(nx, ny, nz);
-    #             P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-    #             P_ref = copy(P);
-    #             P[[1, end],       :,       :] .= 0.0;
-    #             P[       :,[1, end],       :] .= 0.0;
-    #             P[       :,       :,[1, end]] .= 0.0;
-    #             P     = Array(P);
-    #             P_ref = Array(P_ref);
-    #             @require !all(CPUArray(P .== P_ref))
-    #             update_halo!(P);
-    #             @test all(CPUArray(P .== P_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D (not periodic)" begin
-    #             me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
-    #             P     = zeros(nx, ny, nz);
-    #             P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
-    #             P_ref = copy(P);
-    #             P[[1, end],       :,       :] .= 0.0;
-    #             P[       :,[1, end],       :] .= 0.0;
-    #             P[       :,       :,[1, end]] .= 0.0;
-    #             P     = Array(P);
-    #             P_ref = Array(P_ref);
-    #             @require !all(CPUArray(P .== P_ref))
-    #             update_halo!(P);
-    #             @test all(CPUArray(P[2:end-1,2:end-1,2:end-1] .== P_ref[2:end-1,2:end-1,2:end-1]))
-    #             if (coords[1] ==         0) @test all(CPUArray(P[  1,  :,  :] .== 0.0)); else @test all(CPUArray(P[      1,2:end-1,2:end-1] .== P_ref[      1,2:end-1,2:end-1])); end  # Verifcation of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
-    #             if (coords[1] == dims[1]-1) @test all(CPUArray(P[end,  :,  :] .== 0.0)); else @test all(CPUArray(P[    end,2:end-1,2:end-1] .== P_ref[    end,2:end-1,2:end-1])); end
-    #             if (coords[2] ==         0) @test all(CPUArray(P[  :,  1,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,      1,2:end-1] .== P_ref[2:end-1,      1,2:end-1])); end
-    #             if (coords[2] == dims[2]-1) @test all(CPUArray(P[  :,end,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,    end,2:end-1] .== P_ref[2:end-1,    end,2:end-1])); end
-    #             if (coords[3] ==         0) @test all(CPUArray(P[  :,  :,  1] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,      1] .== P_ref[2:end-1,2:end-1,      1])); end
-    #             if (coords[3] == dims[3]-1) @test all(CPUArray(P[  :,  :,end] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,    end] .== P_ref[2:end-1,2:end-1,    end])); end
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #     end;
-    #     @testset "staggered grid (default: periodic)" begin
-    #         @testset "1D" begin
-    #             init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             Vx     = zeros(nx+1);
-    #             Vx    .= [x_g(ix,dx,Vx) for ix=1:size(Vx,1)];
-    #             Vx_ref = copy(Vx);
-    #             Vx[[1, end]] .= 0.0;
-    #             Vx     = Array(Vx);
-    #             Vx_ref = Array(Vx_ref);
-    #             @require !all(CPUArray(Vx .== Vx_ref))
-    #             update_halo!(Vx);
-    #             @test all(CPUArray(Vx .== Vx_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "2D" begin
-    #             init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             Vy     = zeros(nx,ny+1);
-    #             Vy    .= [y_g(iy,dy,Vy)*1e1 + x_g(ix,dx,Vy) for ix=1:size(Vy,1), iy=1:size(Vy,2)];
-    #             Vy_ref = copy(Vy);
-    #             Vy[[1, end],       :] .= 0.0;
-    #             Vy[       :,[1, end]] .= 0.0;
-    #             Vy     = Array(Vy);
-    #             Vy_ref = Array(Vy_ref);
-    #             @require !all(CPUArray(Vy .== Vy_ref))
-    #             update_halo!(Vy);
-    #             @test all(CPUArray(Vy .== Vy_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             Vz     = zeros(nx,ny,nz+1);
-    #             Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #             Vz_ref = copy(Vz);
-    #             Vz[[1, end],       :,       :] .= 0.0;
-    #             Vz[       :,[1, end],       :] .= 0.0;
-    #             Vz[       :,       :,[1, end]] .= 0.0;
-    #             Vz     = Array(Vz);
-    #             Vz_ref = Array(Vz_ref);
-    #             @require !all(CPUArray(Vz .== Vz_ref))
-    #             update_halo!(Vz);
-    #             @test all(CPUArray(Vz .== Vz_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D (non-default overlap)" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
-    #             Vx     = zeros(nx+1,ny,nz);
-    #             Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-    #             Vx_ref = copy(Vx);
-    #             Vx[[1, end],       :,       :] .= 0.0;
-    #             Vx[       :,[1, end],       :] .= 0.0;
-    #             Vx[       :,       :,[1, end]] .= 0.0;
-    #             Vx     = Array(Vx);
-    #             Vx_ref = Array(Vx_ref);
-    #             @require !all(CPUArray(Vx .== Vx_ref))
-    #             update_halo!(Vx);
-    #             @test all(CPUArray(Vx .== Vx_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D (not periodic)" begin
-    #             me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
-    #             Vz     = zeros(nx,ny,nz+1);
-    #             Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #             Vz_ref = copy(Vz);
-    #             Vz[[1, end],       :,       :] .= 0.0;
-    #             Vz[       :,[1, end],       :] .= 0.0;
-    #             Vz[       :,       :,[1, end]] .= 0.0;
-    #             Vz     = Array(Vz);
-    #             Vz_ref = Array(Vz_ref);
-    #             @require !all(CPUArray(Vz .== Vz_ref))
-    #             update_halo!(Vz);
-    #             @test all(CPUArray(Vz[2:end-1,2:end-1,2:end-1] .== Vz_ref[2:end-1,2:end-1,2:end-1]))
-    #             if (coords[1] ==         0) @test all(CPUArray(Vz[  1,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[      1,2:end-1,2:end-1] .== Vz_ref[      1,2:end-1,2:end-1])); end  # Verifcation of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
-    #             if (coords[1] == dims[1]-1) @test all(CPUArray(Vz[end,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[    end,2:end-1,2:end-1] .== Vz_ref[    end,2:end-1,2:end-1])); end
-    #             if (coords[2] ==         0) @test all(CPUArray(Vz[  :,  1,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,      1,2:end-1] .== Vz_ref[2:end-1,      1,2:end-1])); end
-    #             if (coords[2] == dims[2]-1) @test all(CPUArray(Vz[  :,end,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,    end,2:end-1] .== Vz_ref[2:end-1,    end,2:end-1])); end
-    #             if (coords[3] ==         0) @test all(CPUArray(Vz[  :,  :,  1] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,      1] .== Vz_ref[2:end-1,2:end-1,      1])); end
-    #             if (coords[3] == dims[3]-1) @test all(CPUArray(Vz[  :,  :,end] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,    end] .== Vz_ref[2:end-1,2:end-1,    end])); end
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "2D (no halo in one dim)" begin
-    #             init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             A     = zeros(nx-1,ny+2);
-    #             A    .= [y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2)];
-    #             A_ref = copy(A);
-    #             A[[1, end],       :] .= 0.0;
-    #             A[       :,[1, end]] .= 0.0;
-    #             A     = Array(A);
-    #             A_ref = Array(A_ref);
-    #             @require !all(CPUArray(A .== A_ref))
-    #             update_halo!(A);
-    #             @test all(CPUArray(A[2:end-1,:] .== A_ref[2:end-1,:]))
-    #             @test all(CPUArray(A[[1, end],:] .== 0.0))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D (no halo in one dim)" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             A     = zeros(nx+2,ny-1,nz+1);
-    #             A    .= [z_g(iz,dz,A)*1e2 + y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)];
-    #             A_ref = copy(A);
-    #             A[[1, end],       :,       :] .= 0.0;
-    #             A[       :,[1, end],       :] .= 0.0;
-    #             A[       :,       :,[1, end]] .= 0.0;
-    #             A     = Array(A);
-    #             A_ref = Array(A_ref);
-    #             @require !all(CPUArray(A .== A_ref))
-    #             update_halo!(A);
-    #             @test all(CPUArray(A[:,2:end-1,:] .== A_ref[:,2:end-1,:]))
-    #             @test all(CPUArray(A[:,[1, end],:] .== 0.0))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         @testset "3D (Complex)" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             Vz     = zeros(ComplexF16,nx,ny,nz+1);
-    #             Vz    .= [(1+im)*(z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz)) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #             Vz_ref = copy(Vz);
-    #             Vz[[1, end],       :,       :] .= 0.0;
-    #             Vz[       :,[1, end],       :] .= 0.0;
-    #             Vz[       :,       :,[1, end]] .= 0.0;
-    #             Vz     = Array(Vz);
-    #             Vz_ref = Array(Vz_ref);
-    #             @require !all(CPUArray(Vz .== Vz_ref))
-    #             update_halo!(Vz);
-    #             @test all(CPUArray(Vz .== Vz_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #         # @testset "3D (changing datatype)" begin
-    #         #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #         #     Vz     = zeros(nx,ny,nz+1);
-    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #         #     Vz_ref = copy(Vz);
-    #         #     Vx     = zeros(Float32,nx+1,ny,nz);
-    #         #     Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-    #         #     Vx_ref = copy(Vx);
-    #         #     Vz[[1, end],       :,       :] .= 0.0;
-    #         #     Vz[       :,[1, end],       :] .= 0.0;
-    #         #     Vz[       :,       :,[1, end]] .= 0.0;
-    #         #     Vz     = Array(Vz);
-    #         #     Vz_ref = Array(Vz_ref);
-    #         #     @require !all(Vz .== Vz_ref)
-    #         #     update_halo!(Vz);
-    #         #     @test all(Vz .== Vz_ref)
-    #         #     Vx[[1, end],       :,       :] .= 0.0;
-    #         #     Vx[       :,[1, end],       :] .= 0.0;
-    #         #     Vx[       :,       :,[1, end]] .= 0.0;
-    #         #     Vx     = Array(Vx);
-    #         #     Vx_ref = Array(Vx_ref);
-    #         #     @require !all(Vx .== Vx_ref)
-    #         #     update_halo!(Vx);
-    #         #     @test all(Vx .== Vx_ref)
-    #         #     #TODO: added for GPU - quick fix:
-    #         #     Vz     = zeros(nx,ny,nz+1);
-    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #         #     Vz_ref = copy(Vz);
-    #         #     Vz[[1, end],       :,       :] .= 0.0;
-    #         #     Vz[       :,[1, end],       :] .= 0.0;
-    #         #     Vz[       :,       :,[1, end]] .= 0.0;
-    #         #     Vz     = Array(Vz);
-    #         #     Vz_ref = Array(Vz_ref);
-    #         #     @require !all(Vz .== Vz_ref)
-    #         #     update_halo!(Vz);
-    #         #     @test all(Vz .== Vz_ref)
-    #         #     finalize_global_grid(finalize_MPI=false);
-    #         # end;
-    #         # @testset "3D (changing datatype) (Complex)" begin
-    #         #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #         #     Vz     = zeros(nx,ny,nz+1);
-    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #         #     Vz_ref = copy(Vz);
-    #         #     Vx     = zeros(ComplexF64,nx+1,ny,nz);
-    #         #     Vx    .= [(1+im)*(z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx)) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-    #         #     Vx_ref = copy(Vx);
-    #         #     Vz[[1, end],       :,       :] .= 0.0;
-    #         #     Vz[       :,[1, end],       :] .= 0.0;
-    #         #     Vz[       :,       :,[1, end]] .= 0.0;
-    #         #     Vz     = Array(Vz);
-    #         #     Vz_ref = Array(Vz_ref);
-    #         #     @require !all(Vz .== Vz_ref)
-    #         #     update_halo!(Vz);
-    #         #     @test all(Vz .== Vz_ref)
-    #         #     Vx[[1, end],       :,       :] .= 0.0;
-    #         #     Vx[       :,[1, end],       :] .= 0.0;
-    #         #     Vx[       :,       :,[1, end]] .= 0.0;
-    #         #     Vx     = Array(Vx);
-    #         #     Vx_ref = Array(Vx_ref);
-    #         #     @require !all(Vx .== Vx_ref)
-    #         #     update_halo!(Vx);
-    #         #     @test all(Vx .== Vx_ref)
-    #         #     #TODO: added for GPU - quick fix:
-    #         #     Vz     = zeros(nx,ny,nz+1);
-    #         #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #         #     Vz_ref = copy(Vz);
-    #         #     Vz[[1, end],       :,       :] .= 0.0;
-    #         #     Vz[       :,[1, end],       :] .= 0.0;
-    #         #     Vz[       :,       :,[1, end]] .= 0.0;
-    #         #     Vz     = Array(Vz);
-    #         #     Vz_ref = Array(Vz_ref);
-    #         #     @require !all(Vz .== Vz_ref)
-    #         #     update_halo!(Vz);
-    #         #     @test all(Vz .== Vz_ref)
-    #         #     finalize_global_grid(finalize_MPI=false);
-    #         # end;
-    #         @testset "3D (two fields simultaneously)" begin
-    #             init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
-    #             Vz     = zeros(nx,ny,nz+1);
-    #             Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
-    #             Vz_ref = copy(Vz);
-    #             Vx     = zeros(nx+1,ny,nz);
-    #             Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
-    #             Vx_ref = copy(Vx);
-    #             Vz[[1, end],       :,       :] .= 0.0;
-    #             Vz[       :,[1, end],       :] .= 0.0;
-    #             Vz[       :,       :,[1, end]] .= 0.0;
-    #             Vx[[1, end],       :,       :] .= 0.0;
-    #             Vx[       :,[1, end],       :] .= 0.0;
-    #             Vx[       :,       :,[1, end]] .= 0.0;
-    #             Vz     = Array(Vz);
-    #             Vz_ref = Array(Vz_ref);
-    #             Vx     = Array(Vx);
-    #             Vx_ref = Array(Vx_ref);
-    #             @require !all(CPUArray(Vz .== Vz_ref))
-    #             @require !all(CPUArray(Vx .== Vx_ref))
-    #             update_halo!(Vz, Vx);
-    #             @test all(CPUArray(Vz .== Vz_ref))
-    #             @test all(CPUArray(Vx .== Vx_ref))
-    #             finalize_global_grid(finalize_MPI=false);
-    #         end;
-    #     end;
-    # end;
+    @testset "4. halo update ($array_type arrays)" for (array_type, device_type, Array) in zip(array_types, device_types, ArrayConstructors)
+        @testset "basic grid (default: periodic)" begin
+            @testset "1D" begin  # Zero both ends of a coordinate-filled 1D field and check that update_halo! restores them (grid periodic in x).
+                init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
+                P     = zeros(nx);
+                P    .= [x_g(ix,dx,P) for ix=1:size(P,1)];  # one x_g value per cell (presumably the global x-coordinate - confirm)
+                P_ref = copy(P);  # reference copy taken before the boundary is overwritten
+                P[[1, end]] .= 0.0;  # zero the first and last cell
+                P     = Array(P);  # `Array` is the enclosing testset loop's constructor for the array type under test
+                P_ref = Array(P_ref);
+                @require !all(CPUArray(P .== P_ref)) # DEBUG: CPUArray needed here and onwards as mapreduce! is failing on AMDGPU (see https://github.com/JuliaGPU/AMDGPU.jl/issues/210)
+                update_halo!(P);
+                @test all(CPUArray(P .== P_ref))  # after the halo update the field must equal the reference everywhere
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "2D" begin  # Zero the boundary rows/columns of a coordinate-filled 2D field and check that update_halo! restores them (grid periodic in x and y).
+                init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
+                P     = zeros(nx, ny);
+                P    .= [y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2)];  # combine the y_g and x_g values of each cell into one number
+                P_ref = copy(P);  # reference copy taken before the boundary is overwritten
+                P[[1, end],       :] .= 0.0;  # zero first and last index along dimension 1
+                P[       :,[1, end]] .= 0.0;  # zero first and last index along dimension 2
+                P     = Array(P);  # `Array` is the enclosing testset loop's constructor for the array type under test
+                P_ref = Array(P_ref);
+                @require !all(CPUArray(P .== P_ref))  # precondition: the field must differ from the reference before the update
+                update_halo!(P);
+                @test all(CPUArray(P .== P_ref))  # after the halo update the field must equal the reference everywhere
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D" begin  # Zero all six boundary planes of a coordinate-filled 3D field and check that update_halo! restores them (grid periodic in x, y and z).
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+                P     = zeros(nx, ny, nz);
+                P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];  # combine the z_g, y_g and x_g values of each cell into one number
+                P_ref = copy(P);  # reference copy taken before the boundary is overwritten
+                P[[1, end],       :,       :] .= 0.0;  # zero first and last index along dimension 1
+                P[       :,[1, end],       :] .= 0.0;  # zero first and last index along dimension 2
+                P[       :,       :,[1, end]] .= 0.0;  # zero first and last index along dimension 3
+                P     = Array(P);  # `Array` is the enclosing testset loop's constructor for the array type under test
+                P_ref = Array(P_ref);
+                @require !all(CPUArray(P .== P_ref))  # precondition: the field must differ from the reference before the update
+                update_halo!(P);
+                @test all(CPUArray(P .== P_ref))  # after the halo update the field must equal the reference everywhere
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D (non-default overlap)" begin
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=4, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+                P     = zeros(nx, ny, nz);
+                P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+                P_ref = copy(P);
+                P[[1, end],       :,       :] .= 0.0;
+                P[       :,[1, end],       :] .= 0.0;
+                P[       :,       :,[1, end]] .= 0.0;
+                P     = Array(P);
+                P_ref = Array(P_ref);
+                @require !all(CPUArray(P .== P_ref))
+                update_halo!(P);
+                @test all(CPUArray(P .== P_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D (not periodic)" begin
+                me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
+                P     = zeros(nx, ny, nz);
+                P    .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)];
+                P_ref = copy(P);
+                P[[1, end],       :,       :] .= 0.0;
+                P[       :,[1, end],       :] .= 0.0;
+                P[       :,       :,[1, end]] .= 0.0;
+                P     = Array(P);
+                P_ref = Array(P_ref);
+                @require !all(CPUArray(P .== P_ref))
+                update_halo!(P);
+                @test all(CPUArray(P[2:end-1,2:end-1,2:end-1] .== P_ref[2:end-1,2:end-1,2:end-1]))
+                if (coords[1] ==         0) @test all(CPUArray(P[  1,  :,  :] .== 0.0)); else @test all(CPUArray(P[      1,2:end-1,2:end-1] .== P_ref[      1,2:end-1,2:end-1])); end  # Verification of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
+                if (coords[1] == dims[1]-1) @test all(CPUArray(P[end,  :,  :] .== 0.0)); else @test all(CPUArray(P[    end,2:end-1,2:end-1] .== P_ref[    end,2:end-1,2:end-1])); end
+                if (coords[2] ==         0) @test all(CPUArray(P[  :,  1,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,      1,2:end-1] .== P_ref[2:end-1,      1,2:end-1])); end
+                if (coords[2] == dims[2]-1) @test all(CPUArray(P[  :,end,  :] .== 0.0)); else @test all(CPUArray(P[2:end-1,    end,2:end-1] .== P_ref[2:end-1,    end,2:end-1])); end
+                if (coords[3] ==         0) @test all(CPUArray(P[  :,  :,  1] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,      1] .== P_ref[2:end-1,2:end-1,      1])); end
+                if (coords[3] == dims[3]-1) @test all(CPUArray(P[  :,  :,end] .== 0.0)); else @test all(CPUArray(P[2:end-1,2:end-1,    end] .== P_ref[2:end-1,2:end-1,    end])); end
+                finalize_global_grid(finalize_MPI=false);
+            end;
+        end;
+        @testset "staggered grid (default: periodic)" begin
+            @testset "1D" begin
+                init_global_grid(nx, 1, 1; periodx=1, quiet=true, init_MPI=false, device_type=device_type);
+                Vx     = zeros(nx+1);
+                Vx    .= [x_g(ix,dx,Vx) for ix=1:size(Vx,1)];
+                Vx_ref = copy(Vx);
+                Vx[[1, end]] .= 0.0;
+                Vx     = Array(Vx);
+                Vx_ref = Array(Vx_ref);
+                @require !all(CPUArray(Vx .== Vx_ref))
+                update_halo!(Vx);
+                @test all(CPUArray(Vx .== Vx_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "2D" begin
+                init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
+                Vy     = zeros(nx,ny+1);
+                Vy    .= [y_g(iy,dy,Vy)*1e1 + x_g(ix,dx,Vy) for ix=1:size(Vy,1), iy=1:size(Vy,2)];
+                Vy_ref = copy(Vy);
+                Vy[[1, end],       :] .= 0.0;
+                Vy[       :,[1, end]] .= 0.0;
+                Vy     = Array(Vy);
+                Vy_ref = Array(Vy_ref);
+                @require !all(CPUArray(Vy .== Vy_ref))
+                update_halo!(Vy);
+                @test all(CPUArray(Vy .== Vy_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D" begin
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+                Vz     = zeros(nx,ny,nz+1);
+                Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+                Vz_ref = copy(Vz);
+                Vz[[1, end],       :,       :] .= 0.0;
+                Vz[       :,[1, end],       :] .= 0.0;
+                Vz[       :,       :,[1, end]] .= 0.0;
+                Vz     = Array(Vz);
+                Vz_ref = Array(Vz_ref);
+                @require !all(CPUArray(Vz .== Vz_ref))
+                update_halo!(Vz);
+                @test all(CPUArray(Vz .== Vz_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D (non-default overlap)" begin
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false, device_type=device_type);
+                Vx     = zeros(nx+1,ny,nz);
+                Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+                Vx_ref = copy(Vx);
+                Vx[[1, end],       :,       :] .= 0.0;
+                Vx[       :,[1, end],       :] .= 0.0;
+                Vx[       :,       :,[1, end]] .= 0.0;
+                Vx     = Array(Vx);
+                Vx_ref = Array(Vx_ref);
+                @require !all(CPUArray(Vx .== Vx_ref))
+                update_halo!(Vx);
+                @test all(CPUArray(Vx .== Vx_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D (not periodic)" begin
+                me, dims, nprocs, coords = init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type);
+                Vz     = zeros(nx,ny,nz+1);
+                Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+                Vz_ref = copy(Vz);
+                Vz[[1, end],       :,       :] .= 0.0;
+                Vz[       :,[1, end],       :] .= 0.0;
+                Vz[       :,       :,[1, end]] .= 0.0;
+                Vz     = Array(Vz);
+                Vz_ref = Array(Vz_ref);
+                @require !all(CPUArray(Vz .== Vz_ref))
+                update_halo!(Vz);
+                @test all(CPUArray(Vz[2:end-1,2:end-1,2:end-1] .== Vz_ref[2:end-1,2:end-1,2:end-1]))
+                if (coords[1] ==         0) @test all(CPUArray(Vz[  1,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[      1,2:end-1,2:end-1] .== Vz_ref[      1,2:end-1,2:end-1])); end  # Verification of corner values would be cumbersome here; it is already sufficiently covered in the periodic tests.
+                if (coords[1] == dims[1]-1) @test all(CPUArray(Vz[end,  :,  :] .== 0.0)); else @test all(CPUArray(Vz[    end,2:end-1,2:end-1] .== Vz_ref[    end,2:end-1,2:end-1])); end
+                if (coords[2] ==         0) @test all(CPUArray(Vz[  :,  1,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,      1,2:end-1] .== Vz_ref[2:end-1,      1,2:end-1])); end
+                if (coords[2] == dims[2]-1) @test all(CPUArray(Vz[  :,end,  :] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,    end,2:end-1] .== Vz_ref[2:end-1,    end,2:end-1])); end
+                if (coords[3] ==         0) @test all(CPUArray(Vz[  :,  :,  1] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,      1] .== Vz_ref[2:end-1,2:end-1,      1])); end
+                if (coords[3] == dims[3]-1) @test all(CPUArray(Vz[  :,  :,end] .== 0.0)); else @test all(CPUArray(Vz[2:end-1,2:end-1,    end] .== Vz_ref[2:end-1,2:end-1,    end])); end
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "2D (no halo in one dim)" begin
+                init_global_grid(nx, ny, 1; periodx=1, periody=1, quiet=true, init_MPI=false, device_type=device_type);
+                A     = zeros(nx-1,ny+2);
+                A    .= [y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2)];
+                A_ref = copy(A);
+                A[[1, end],       :] .= 0.0;
+                A[       :,[1, end]] .= 0.0;
+                A     = Array(A);
+                A_ref = Array(A_ref);
+                @require !all(CPUArray(A .== A_ref))
+                update_halo!(A);
+                @test all(CPUArray(A[2:end-1,:] .== A_ref[2:end-1,:]))
+                @test all(CPUArray(A[[1, end],:] .== 0.0))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D (no halo in one dim)" begin
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+                A     = zeros(nx+2,ny-1,nz+1);
+                A    .= [z_g(iz,dz,A)*1e2 + y_g(iy,dy,A)*1e1 + x_g(ix,dx,A) for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)];
+                A_ref = copy(A);
+                A[[1, end],       :,       :] .= 0.0;
+                A[       :,[1, end],       :] .= 0.0;
+                A[       :,       :,[1, end]] .= 0.0;
+                A     = Array(A);
+                A_ref = Array(A_ref);
+                @require !all(CPUArray(A .== A_ref))
+                update_halo!(A);
+                @test all(CPUArray(A[:,2:end-1,:] .== A_ref[:,2:end-1,:]))
+                @test all(CPUArray(A[:,[1, end],:] .== 0.0))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "3D (Complex)" begin
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+                Vz     = zeros(ComplexF16,nx,ny,nz+1);
+                Vz    .= [(1+im)*(z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz)) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+                Vz_ref = copy(Vz);
+                Vz[[1, end],       :,       :] .= 0.0;
+                Vz[       :,[1, end],       :] .= 0.0;
+                Vz[       :,       :,[1, end]] .= 0.0;
+                Vz     = Array(Vz);
+                Vz_ref = Array(Vz_ref);
+                @require !all(CPUArray(Vz .== Vz_ref))
+                update_halo!(Vz);
+                @test all(CPUArray(Vz .== Vz_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            # @testset "3D (changing datatype)" begin
+            #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+            #     Vz     = zeros(nx,ny,nz+1);
+            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+            #     Vz_ref = copy(Vz);
+            #     Vx     = zeros(Float32,nx+1,ny,nz);
+            #     Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+            #     Vx_ref = copy(Vx);
+            #     Vz[[1, end],       :,       :] .= 0.0;
+            #     Vz[       :,[1, end],       :] .= 0.0;
+            #     Vz[       :,       :,[1, end]] .= 0.0;
+            #     Vz     = Array(Vz);
+            #     Vz_ref = Array(Vz_ref);
+            #     @require !all(Vz .== Vz_ref)
+            #     update_halo!(Vz);
+            #     @test all(Vz .== Vz_ref)
+            #     Vx[[1, end],       :,       :] .= 0.0;
+            #     Vx[       :,[1, end],       :] .= 0.0;
+            #     Vx[       :,       :,[1, end]] .= 0.0;
+            #     Vx     = Array(Vx);
+            #     Vx_ref = Array(Vx_ref);
+            #     @require !all(Vx .== Vx_ref)
+            #     update_halo!(Vx);
+            #     @test all(Vx .== Vx_ref)
+            #     #TODO: added for GPU - quick fix:
+            #     Vz     = zeros(nx,ny,nz+1);
+            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+            #     Vz_ref = copy(Vz);
+            #     Vz[[1, end],       :,       :] .= 0.0;
+            #     Vz[       :,[1, end],       :] .= 0.0;
+            #     Vz[       :,       :,[1, end]] .= 0.0;
+            #     Vz     = Array(Vz);
+            #     Vz_ref = Array(Vz_ref);
+            #     @require !all(Vz .== Vz_ref)
+            #     update_halo!(Vz);
+            #     @test all(Vz .== Vz_ref)
+            #     finalize_global_grid(finalize_MPI=false);
+            # end;
+            # @testset "3D (changing datatype) (Complex)" begin
+            #     init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+            #     Vz     = zeros(nx,ny,nz+1);
+            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+            #     Vz_ref = copy(Vz);
+            #     Vx     = zeros(ComplexF64,nx+1,ny,nz);
+            #     Vx    .= [(1+im)*(z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx)) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+            #     Vx_ref = copy(Vx);
+            #     Vz[[1, end],       :,       :] .= 0.0;
+            #     Vz[       :,[1, end],       :] .= 0.0;
+            #     Vz[       :,       :,[1, end]] .= 0.0;
+            #     Vz     = Array(Vz);
+            #     Vz_ref = Array(Vz_ref);
+            #     @require !all(Vz .== Vz_ref)
+            #     update_halo!(Vz);
+            #     @test all(Vz .== Vz_ref)
+            #     Vx[[1, end],       :,       :] .= 0.0;
+            #     Vx[       :,[1, end],       :] .= 0.0;
+            #     Vx[       :,       :,[1, end]] .= 0.0;
+            #     Vx     = Array(Vx);
+            #     Vx_ref = Array(Vx_ref);
+            #     @require !all(Vx .== Vx_ref)
+            #     update_halo!(Vx);
+            #     @test all(Vx .== Vx_ref)
+            #     #TODO: added for GPU - quick fix:
+            #     Vz     = zeros(nx,ny,nz+1);
+            #     Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+            #     Vz_ref = copy(Vz);
+            #     Vz[[1, end],       :,       :] .= 0.0;
+            #     Vz[       :,[1, end],       :] .= 0.0;
+            #     Vz[       :,       :,[1, end]] .= 0.0;
+            #     Vz     = Array(Vz);
+            #     Vz_ref = Array(Vz_ref);
+            #     @require !all(Vz .== Vz_ref)
+            #     update_halo!(Vz);
+            #     @test all(Vz .== Vz_ref)
+            #     finalize_global_grid(finalize_MPI=false);
+            # end;
+            @testset "3D (two fields simultaneously)" begin
+                init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, quiet=true, init_MPI=false, device_type=device_type);
+                Vz     = zeros(nx,ny,nz+1);
+                Vz    .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)];
+                Vz_ref = copy(Vz);
+                Vx     = zeros(nx+1,ny,nz);
+                Vx    .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)];
+                Vx_ref = copy(Vx);
+                Vz[[1, end],       :,       :] .= 0.0;
+                Vz[       :,[1, end],       :] .= 0.0;
+                Vz[       :,       :,[1, end]] .= 0.0;
+                Vx[[1, end],       :,       :] .= 0.0;
+                Vx[       :,[1, end],       :] .= 0.0;
+                Vx[       :,       :,[1, end]] .= 0.0;
+                Vz     = Array(Vz);
+                Vz_ref = Array(Vz_ref);
+                Vx     = Array(Vx);
+                Vx_ref = Array(Vx_ref);
+                @require !all(CPUArray(Vz .== Vz_ref))
+                @require !all(CPUArray(Vx .== Vx_ref))
+                update_halo!(Vz, Vx);
+                @test all(CPUArray(Vz .== Vz_ref))
+                @test all(CPUArray(Vx .== Vx_ref))
+                finalize_global_grid(finalize_MPI=false);
+            end;
+        end;
+    end;
 end;
 
 ## Test tear down

From d3330b220da8517c35eeb4e00f3a4f5e8d3e05bd Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Tue, 18 Jul 2023 23:39:53 +0300
Subject: [PATCH 08/21] Hotfix to circumvent mapreduce issue on AMDGPU

---
 test/test_update_halo.jl | 76 ++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index 4344fa7..18f82d1 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -427,8 +427,8 @@ dz = 1.0
                 GG.wait_iwrite(n, P, 1);
                 GG.wait_iwrite(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0))
                 else
                     @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:]))
                     @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
@@ -439,8 +439,8 @@ dz = 1.0
                 GG.wait_iwrite(n, P, 1);
                 GG.wait_iwrite(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:])))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0))
                 else
                     @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:]))
                     @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0)
@@ -452,8 +452,8 @@ dz = 1.0
                 GG.wait_iwrite(n, P, 1);
                 GG.wait_iwrite(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:])))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:])))
                 else
                     @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:]))
                     @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:]))
@@ -464,8 +464,8 @@ dz = 1.0
                 GG.wait_iwrite(n, P, 1);
                 GG.wait_iwrite(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:])))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:])))
                 else
                     @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:]))
                     @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:]))
@@ -477,8 +477,8 @@ dz = 1.0
                 GG.wait_iwrite(n, P, 1);
                 GG.wait_iwrite(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:])))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:])))
                 else
                     @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:]))
                     @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:]))
@@ -489,8 +489,8 @@ dz = 1.0
                 GG.wait_iwrite(n, P, 1);
                 GG.wait_iwrite(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))
-                    @test all(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:])))
+                    @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:])))
                 else
                     @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:]))
                     @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:]))
@@ -522,8 +522,8 @@ dz = 1.0
                 GG.wait_iread(n, P, 1);
                 GG.wait_iread(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))
-                    @test all(                          0.0 .== Array(A[1,:,:][:]))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:])))
+                    @test all(CPUArray(                          0.0 .== Array(A[1,:,:][:])))
                 else
                     @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:]))
                     @test all(                       0.0 .== CPUArray(A[1,:,:][:]))
@@ -534,8 +534,8 @@ dz = 1.0
                 GG.wait_iread(n, P, 1);
                 GG.wait_iread(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))
-                    @test all(                          0.0 .== Array(A[end,:,:][:]))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:])))
+                    @test all(CPUArray(                          0.0 .== Array(A[end,:,:][:])))
                 else
                     @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:]))
                     @test all(                       0.0 .== CPUArray(A[end,:,:][:]))
@@ -556,8 +556,8 @@ dz = 1.0
                 GG.wait_iread(n, P, 1);
                 GG.wait_iread(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:])))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:])))
                 else
                     @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:]))
                     @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:]))
@@ -568,8 +568,8 @@ dz = 1.0
                 GG.wait_iread(n, P, 1);
                 GG.wait_iread(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:])))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:])))
                 else
                     @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:]))
                     @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:]))
@@ -590,8 +590,8 @@ dz = 1.0
                 GG.wait_iread(n, P, 1);
                 GG.wait_iread(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:])))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:])))
                 else
                     @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:]))
                     @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:]))
@@ -602,8 +602,8 @@ dz = 1.0
                 GG.wait_iread(n, P, 1);
                 GG.wait_iread(n, A, 2);
                 if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                    @test all(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))
-                    @test all(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:])))
+                    @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:])))
                 else
                     @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:]))
                     @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:]))
@@ -631,10 +631,10 @@ dz = 1.0
                         GG.sendrecv_halo_local(n, dim, A, 2);
                     end
                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
-                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
+                        @test all(CPUArray(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(1,dim,2,A) .== 0.0));  # There is no halo (ol(dim,A) < 2).
+                        @test all(CPUArray(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(2,dim,2,A) .== 0.0));  # There is no halo (ol(dim,A) < 2).
                     else
                         @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
                         @test all(GG.recvbuf_flat(1,dim,2,A) .== 0.0);  # There is no halo (ol(dim,A) < 2).
@@ -656,10 +656,10 @@ dz = 1.0
                         GG.sendrecv_halo_local(n, dim, A, 2);
                     end
                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
-                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A)));
                     else
                         @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
                         @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
@@ -681,10 +681,10 @@ dz = 1.0
                         GG.sendrecv_halo_local(n, dim, A, 2);
                     end
                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A));
-                        @test all(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P));
-                        @test all(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(1,dim,1,P) .== GG.gpusendbuf_flat(2,dim,1,P)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(1,dim,2,A) .== GG.gpusendbuf_flat(2,dim,2,A)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(2,dim,1,P) .== GG.gpusendbuf_flat(1,dim,1,P)));
+                        @test all(CPUArray(GG.gpurecvbuf_flat(2,dim,2,A) .== GG.gpusendbuf_flat(1,dim,2,A)));
                     else
                         @test all(GG.recvbuf_flat(1,dim,1,P) .== GG.sendbuf_flat(2,dim,1,P));
                         @test all(GG.recvbuf_flat(1,dim,2,A) .== GG.sendbuf_flat(2,dim,2,A));
@@ -726,8 +726,8 @@ dz = 1.0
                 MPI.Waitall!(reqs[:]);
                 for n = 1:nneighbors_per_dim
                     if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
-                        @test all(GG.gpurecvbuf(n,dim,1,P) .== 9.0)
-                        @test all(GG.gpurecvbuf(n,dim,2,A) .== 9.0)
+                        @test all(CPUArray(GG.gpurecvbuf(n,dim,1,P) .== 9.0))
+                        @test all(CPUArray(GG.gpurecvbuf(n,dim,2,A) .== 9.0))
                     else
                         @test all(GG.recvbuf(n,dim,1,P) .== 9.0)
                         @test all(GG.recvbuf(n,dim,2,A) .== 9.0)

From aaf7ccc6402e45b8d59c3fef65bb46956d692d33 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 19 Jul 2023 10:44:14 +0300
Subject: [PATCH 09/21] Fixup update halo tests for multi-GPUs aware MPI

---
 test/test_update_halo.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index 18f82d1..d10527c 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -715,6 +715,12 @@ dz = 1.0
                         GG.recvbuf(n,dim,2,A) .= 0;
                     end
                 end
+                # DEBUG: Filling arrays is async (at least on AMDGPU); sync is needed.
+                if (array_type=="CUDA" && GG.cudaaware_MPI(dim))
+                    CUDA.synchronize()
+                elseif (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim))
+                    AMDGPU.synchronize()
+                end
                 reqs  = fill(MPI.REQUEST_NULL, 2, nneighbors_per_dim, 2);
                 for n = 1:nneighbors_per_dim
                     reqs[1,n,1] = GG.irecv_halo!(n, dim, P, 1);

From a8af59d1d01b7bc021778f0cc8afbadedcc2cc56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludovic=20R=C3=A4ss?=
 <61313342+luraess@users.noreply.github.com>
Date: Wed, 19 Jul 2023 10:05:43 +0200
Subject: [PATCH 10/21] Fix device selection in AMDGPU

use `device_id!`
---
 src/select_device.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/select_device.jl b/src/select_device.jl
index 123b71c..a571c7e 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -27,7 +27,7 @@ function select_device()
         me_l      = MPI.Comm_rank(comm_l)
         device_id = amdgpu_enabled() ? me_l+1 : me_l
         if     cuda_enabled()   CUDA.device!(device_id)
-        elseif amdgpu_enabled() AMDGPU.device!(device_id)
+        elseif amdgpu_enabled() AMDGPU.device_id!(device_id)
         end
         return device_id
     else

From d0c9348cf3c53ec4d158d568c0d02b8bf27995ff Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 19 Jul 2023 11:13:20 +0300
Subject: [PATCH 11/21] Restore test file selection

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 8697640..a6a5800 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,7 +8,7 @@ excludedfiles = ["test_excluded.jl"];
 function runtests()
     exename   = joinpath(Sys.BINDIR, Base.julia_exename())
     testdir   = pwd()
-    istest(f) = endswith(f, ".jl") && startswith(f, "test_up")
+    istest(f) = endswith(f, ".jl") && startswith(f, "test_")
     testfiles = sort(filter(istest, readdir(testdir)))
 
     nfail = 0

From 075593c5fbae80dbf0faa0d714804d7abd6f60c1 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 19 Jul 2023 11:34:41 +0300
Subject: [PATCH 12/21] Fix CI

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index efe0732..260cc29 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,9 +20,9 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.7' # Minimum required Julia version (due to dependency of AMDGPU.jl)
+          - '1.8' # Minimum required Julia version (due to dependency of AMDGPU.jl)
           - '1'   # Latest stable 1.x release of Julia
-          # - 'nightly'
+          - 'nightly'
         os:
           - ubuntu-latest
           - macOS-latest

From 243ae207816955f7895f781b6db3672aec5883e4 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Wed, 19 Jul 2023 11:56:29 +0300
Subject: [PATCH 13/21] Fixup CI

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 260cc29..9864d54 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.8' # Minimum required Julia version (due to dependency of AMDGPU.jl)
+          # - '1.7' # Skipping this version because of AMDGPU deps compat issue (rocBLAS_jll)
           - '1'   # Latest stable 1.x release of Julia
           - 'nightly'
         os:

From 8f5c01bc92d151cfdbe820f2eb3dad548ced30d4 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Thu, 20 Jul 2023 18:35:57 +0300
Subject: [PATCH 14/21] Fixup unsafe_copy3d

---
 src/update_halo.jl       | 94 +++++++++++++++++++---------------------
 test/test_update_halo.jl | 37 ++++++++--------
 2 files changed, 62 insertions(+), 69 deletions(-)

diff --git a/src/update_halo.jl b/src/update_halo.jl
index 325fb9e..44f0716 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -99,8 +99,7 @@ let
     curecvbufs_raw_h = nothing
     rocsendbufs_raw = nothing
     rocrecvbufs_raw = nothing
-    # rocsendbufs_raw_h = nothing
-    # rocrecvbufs_raw_h = nothing
+    # INFO: no need for roc host buffers
 
     function free_update_halo_buffers()
         if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(cusendbufs_raw) end
@@ -109,8 +108,7 @@ let
         if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(curecvbufs_raw_h) end
         if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
         if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
-        # if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocsendbufs_raw_h) end
-        # if (amdgpu_enabled() && none(amdgpuaware_MPI())) unregister_gpubufs(rocrecvbufs_raw_h) end
+        # INFO: no need to unregister roc host buffers
         sendbufs_raw = nothing
         recvbufs_raw = nothing
         cusendbufs_raw = nothing
@@ -119,8 +117,7 @@ let
         curecvbufs_raw_h = nothing
         rocsendbufs_raw = nothing
         rocrecvbufs_raw = nothing
-        # rocsendbufs_raw_h = nothing
-        # rocrecvbufs_raw_h = nothing
+        # INFO: no need for roc host buffers
         GC.gc()
     end
 
@@ -143,7 +140,7 @@ let
             for i = 1:length(bufs)
                 for n = 1:length(bufs[i])
                     if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
-                    # if (isa(bufs[i][n],AMDGPU.Mem.HostBuffer)) AMDGPU.HIP.hipHostUnregister(bufs[i][n]); bufs[i][n] = []; end
+                    # INFO: no need for roc host buffers
                 end
             end
         end
@@ -252,15 +249,13 @@ let
     function init_rocbufs_arrays()
         rocsendbufs_raw = Array{Array{Any,1},1}();
         rocrecvbufs_raw = Array{Array{Any,1},1}();
-        # rocsendbufs_raw_h = Array{Array{Any,1},1}();
-        # rocrecvbufs_raw_h = Array{Array{Any,1},1}();
+        # INFO: no need for roc host buffers
     end
 
     function init_rocbufs(T::DataType, fields::GGArray...)
         while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
         while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
-        # while (length(rocsendbufs_raw_h) < length(fields)) push!(rocsendbufs_raw_h, [[], []]); end
-        # while (length(rocrecvbufs_raw_h) < length(fields)) push!(rocrecvbufs_raw_h, [[], []]); end
+        # INFO: no need for roc host buffers
     end
 
     function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer)
@@ -274,10 +269,7 @@ let
     end
 
     function reregister_rocbufs(T::DataType, i::Integer, n::Integer)
-        # if (isa(rocsendbufs_raw_h[i][n],AMDGPU.Mem.HostBuffer)) AMDGPU.HIP.hipHostUnregister(rocsendbufs_raw_h[i][n]); rocsendbufs_raw_h[i][n] = []; end
-        # if (isa(rocrecvbufs_raw_h[i][n],AMDGPU.Mem.HostBuffer)) AMDGPU.HIP.hipHostUnregister(rocrecvbufs_raw_h[i][n]); rocrecvbufs_raw_h[i][n] = []; end
-        # rocsendbufs_raw[i][n], rocsendbufs_raw_h[i][n] = register(ROCArray,sendbufs_raw[i][n]);
-        # rocrecvbufs_raw[i][n], rocrecvbufs_raw_h[i][n] = register(ROCArray,recvbufs_raw[i][n]);
+        # INFO: no need for roc host buffers
         rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]);
         rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]);
     end
@@ -500,15 +492,15 @@ let
     function iwrite_sendbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
-            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+            if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                 ranges = sendranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                 halosize = [r[end] - r[1] + 1 for r in ranges];
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
-            # else
-            #     write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
-            # end
+            else
+                write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
+            end
         end
     end
 end
@@ -529,15 +521,15 @@ let
     function iread_recvbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
-            # if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+            if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                 ranges = recvranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                 halosize = [r[end] - r[1] + 1 for r in ranges];
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
-            # else
-            #     read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
-            # end
+            else
+                read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
+            end
         end
     end
 
@@ -683,33 +675,35 @@ function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrang
     return nothing
 end
 
-# # Write to the send buffer on the host from the array on the device (d2h).
-# function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
-#     locked_ptr = convert(Ptr{T}, AMDGPU.Mem.lock(pointer(sendbuf),sizeof(sendbuf),get_default_agent()))
-#     AMDGPU.Mem.unsafe_copy3d!(
-#         locked_ptr, pointer(A),
-#         length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
-#         srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
-#         srcPitch=sizeof(T)*size(A,1), srcSlice=sizeof(T)*size(A,1)*size(A,2),
-#         dstPitch=sizeof(T)*length(sendranges[1]), dstSlice=sizeof(T)*length(sendranges[1])*length(sendranges[2]),
-#         async=true, signal=signal
-#     )
-#     return nothing
-# end
-
-# # Read from the receive buffer on the host and store on the array on the device (h2d).
-# function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, signal::HSASignal) where T <: GGNumber where T2 <: Integer
-#     locked_ptr = convert(Ptr{T}, AMDGPU.Mem.lock(pointer(recvbuf),sizeof(recvbuf),get_default_agent()))
-#     AMDGPU.Mem.unsafe_copy3d!(
-#         pointer(A), locked_ptr,
-#         length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
-#         dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
-#         srcPitch=sizeof(T)*length(recvranges[1]), srcSlice=sizeof(T)*length(recvranges[1])*length(recvranges[2]),
-#         dstPitch=sizeof(T)*size(A,1), dstSlice=sizeof(T)*size(A,1)size(A,2),
-#         async=true, signal=signal
-#     )
-#     return nothing
-# end
+# Write to the send buffer on the host from the array on the device (d2h).
+function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
+    buf_view = reshape(sendbuf, Tuple(length.(sendranges)))
+    AMDGPU.Mem.unsafe_copy3d!(
+        pointer(sendbuf), AMDGPU.Mem.HostBuffer,
+        pointer(A), typeof(A.buf),
+        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
+        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
+        dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2),
+        srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2),
+        async=true, stream=rocstream
+    )
+    return nothing
+end
+
+# Read from the receive buffer on the host and store on the array on the device (h2d).
+function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
+    buf_view = reshape(recvbuf, Tuple(length.(recvranges)))
+    AMDGPU.Mem.unsafe_copy3d!(
+        pointer(A), typeof(A.buf),
+        pointer(recvbuf), AMDGPU.Mem.HostBuffer,
+        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
+        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
+        dstPitch=sizeof(T) * size(A,1), dstHeight=size(A, 2),
+        srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2),
+        async=true, stream=rocstream
+    )
+    return nothing
+end
 
 ##------------------------------
 ## FUNCTIONS TO SEND/RECV FIELDS
diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index d10527c..f646324 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -341,7 +341,6 @@ dz = 1.0
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         CUDA.Mem.unregister(buf_h);
                     elseif array_type == "AMDGPU"
-                        @info "needs async memcopy fix"
                         # (dim=1)
                         dim = 1;
                         P2  = gpuzeros(eltype(P),size(P));
@@ -357,12 +356,12 @@ dz = 1.0
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         buf .= 0.0;
                         P2  .= 0.0;
-                        # rocstream = AMDGPU.HIPStream();
-                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        # AMDGPU.unsafe_free!(buf_d);
+                        rocstream = AMDGPU.HIPStream();
+                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        AMDGPU.unsafe_free!(buf_d);
                         # (dim=2)
                         dim = 2;
                         P2  = gpuzeros(eltype(P),size(P));
@@ -378,12 +377,12 @@ dz = 1.0
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         buf .= 0.0;
                         P2  .= 0.0;
-                        # rocstream = AMDGPU.HIPStream();
-                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        # AMDGPU.unsafe_free!(buf_d);
+                        rocstream = AMDGPU.HIPStream();
+                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        AMDGPU.unsafe_free!(buf_d);
                         # (dim=3)
                         dim = 3
                         P2  = gpuzeros(eltype(P),size(P));
@@ -399,12 +398,12 @@ dz = 1.0
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
                         buf .= 0.0;
                         P2  .= 0.0;
-                        # rocstream = AMDGPU.HIPStream();
-                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        # AMDGPU.unsafe_free!(buf_d);
+                        rocstream = AMDGPU.HIPStream();
+                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        AMDGPU.unsafe_free!(buf_d);
                     end
                     finalize_global_grid(finalize_MPI=false);
                 end;

From 985ab7206d85e5e7748ce1878ee1102501a87435 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Thu, 20 Jul 2023 22:40:45 +0300
Subject: [PATCH 15/21] Fix style

---
 src/update_halo.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/update_halo.jl b/src/update_halo.jl
index 44f0716..ae12686 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -698,7 +698,7 @@ function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::
         pointer(recvbuf), AMDGPU.Mem.HostBuffer,
         length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
         dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
-        dstPitch=sizeof(T) * size(A,1), dstHeight=size(A, 2),
+        dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2),
         srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2),
         async=true, stream=rocstream
     )

From ec01373a3a5ae2e15fc68b21a4a898c4bcd8314a Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 22 Jul 2023 00:55:33 +0300
Subject: [PATCH 16/21] Bump Julia version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 532ba78..495398f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,7 @@ AMDGPU = "0.5"
 CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4"
 LoopVectorization = "0.12"
 MPI = "0.20"
-julia = "1.7"
+julia = "1.9"
 
 [deps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

From 65cf660cb4bc198aafc6ae716e997e59fb37b6e9 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 22 Jul 2023 00:57:40 +0300
Subject: [PATCH 17/21] Comment out Windows tests which currently fail on nightly.

AMDGPU not supported - this should be fixed when using Extensions
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9864d54..b8b07fa 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
         os:
           - ubuntu-latest
           - macOS-latest
-          - windows-latest
+          # - windows-latest
         arch:
           - x64
     steps:

From 6ba7e199f8d4b736cc8c516243a577510eeb563a Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 22 Jul 2023 01:01:08 +0300
Subject: [PATCH 18/21] Fix doc build

---
 docs/Project.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index ffa1855..6365a5b 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,4 +1,3 @@
 [deps]
-ImplicitGlobalGrid = "d35fcfd7-7af4-4c67-b1aa-d78070614af4"
 DocExtensions = "cbdad009-89f1-4e05-85a0-06b07b50707d"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

From 2ef95782f80a69532220d7fae3eaf01b8a8465fe Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 22 Jul 2023 13:30:27 +0300
Subject: [PATCH 19/21] Comment doc build for now

---
 .github/workflows/ci.yml | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b8b07fa..7c37789 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -51,22 +51,22 @@ jobs:
       - uses: codecov/codecov-action@v2
         with:
           files: lcov.info
-  docs:
-    name: Documentation
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: julia-actions/setup-julia@v1
-        with:
-          version: '1'
-      - uses: julia-actions/julia-buildpkg@v1
-      - uses: julia-actions/julia-docdeploy@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
-      - run: |
-          julia --project=docs -e '
-            using Documenter: DocMeta, doctest
-            using ImplicitGlobalGrid
-            DocMeta.setdocmeta!(ImplicitGlobalGrid, :DocTestSetup, :(using ImplicitGlobalGrid); recursive=true)
-            doctest(ImplicitGlobalGrid)'
+  # docs:
+  #   name: Documentation
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v2
+  #     - uses: julia-actions/setup-julia@v1
+  #       with:
+  #         version: '1'
+  #     - uses: julia-actions/julia-buildpkg@v1
+  #     - uses: julia-actions/julia-docdeploy@v1
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #         DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+  #     - run: |
+  #         julia --project=docs -e '
+  #           using Documenter: DocMeta, doctest
+  #           using ImplicitGlobalGrid
+  #           DocMeta.setdocmeta!(ImplicitGlobalGrid, :DocTestSetup, :(using ImplicitGlobalGrid); recursive=true)
+  #           doctest(ImplicitGlobalGrid)'

From 7f151af45c6f52bdeafcab02c107841ac7f7bfe4 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 22 Jul 2023 16:29:32 +0300
Subject: [PATCH 20/21] Comment AMDGPU async memcpy tests for now.

There may be an issue in the underlying HIP function that we need to figure out.
---
 test/test_update_halo.jl | 48 ++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl
index f646324..a737bc1 100644
--- a/test/test_update_halo.jl
+++ b/test/test_update_halo.jl
@@ -354,14 +354,14 @@ dz = 1.0
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
                         @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        rocstream = AMDGPU.HIPStream();
-                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        AMDGPU.unsafe_free!(buf_d);
+                        # buf .= 0.0; # DEBUG: disabling read_x2x_async! tests for now in AMDGPU backend because there is an issue most likely in HIP
+                        # P2  .= 0.0;
+                        # rocstream = AMDGPU.HIPStream();
+                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        # AMDGPU.unsafe_free!(buf_d);
                         # (dim=2)
                         dim = 2;
                         P2  = gpuzeros(eltype(P),size(P));
@@ -375,14 +375,14 @@ dz = 1.0
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
                         @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        rocstream = AMDGPU.HIPStream();
-                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        AMDGPU.unsafe_free!(buf_d);
+                        # buf .= 0.0; # DEBUG: disabling read_x2x_async! tests for now in AMDGPU backend because there is an issue most likely in HIP
+                        # P2  .= 0.0;
+                        # rocstream = AMDGPU.HIPStream();
+                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        # AMDGPU.unsafe_free!(buf_d);
                         # (dim=3)
                         dim = 3
                         P2  = gpuzeros(eltype(P),size(P));
@@ -396,14 +396,14 @@ dz = 1.0
                         @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
                         @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize();
                         @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        buf .= 0.0;
-                        P2  .= 0.0;
-                        rocstream = AMDGPU.HIPStream();
-                        GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-                        GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
-                        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-                        AMDGPU.unsafe_free!(buf_d);
+                        # buf .= 0.0; # DEBUG: disabling read_x2x_async! tests for now in AMDGPU backend because there is an issue most likely in HIP
+                        # P2  .= 0.0;
+                        # rocstream = AMDGPU.HIPStream();
+                        # GG.write_d2h_async!(buf, P, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
+                        # GG.read_h2d_async!(buf, P2, ranges, rocstream); AMDGPU.synchronize();
+                        # @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
+                        # AMDGPU.unsafe_free!(buf_d);
                     end
                     finalize_global_grid(finalize_MPI=false);
                 end;

From c11672686a78de8b39fa9aa5319b009bbf0d5041 Mon Sep 17 00:00:00 2001
From: Ludovic Raess <ludovic.rass@gmail.com>
Date: Sat, 22 Jul 2023 16:31:12 +0300
Subject: [PATCH 21/21] Commenting async copy for now in read/write buf
 functions

---
 src/update_halo.jl | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/update_halo.jl b/src/update_halo.jl
index ae12686..ad25e7f 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -492,15 +492,16 @@ let
     function iwrite_sendbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
-            if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+            # DEBUG 2: commenting out write_d2h_async! for now
+            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                 ranges = sendranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                 halosize = [r[end] - r[1] + 1 for r in ranges];
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
-            else
-                write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
-            end
+            # else
+            #     write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
+            # end
         end
     end
 end
@@ -521,15 +522,16 @@ let
     function iread_recvbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
         if ol(dim,A) >= 2  # There is only a halo and thus a halo update if the overlap is at least 2...
             # DEBUG: the follow section needs perf testing
-            if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+            # DEBUG 2: commenting read_h2d_async! for now
+            # if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                 ranges = recvranges(n, dim, A);
                 nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                 halosize = [r[end] - r[1] + 1 for r in ranges];
                 nblocks  = Tuple(ceil.(Int, halosize./nthreads));
                 @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
-            else
-                read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
-            end
+            # else
+            #     read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
+            # end
         end
     end