From 2c6e065b25bb258fb67366430b0f966948c80914 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:41:09 +0100 Subject: [PATCH] move code related to accessing buffers to extensions --- src/AMDGPUExt/update_halo.jl | 135 ++++++++++++++++++- src/CUDAExt/update_halo.jl | 125 +++++++++++++++++- src/update_halo.jl | 250 ----------------------------------- 3 files changed, 258 insertions(+), 252 deletions(-) diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl index acbdea7..381f5f4 100644 --- a/src/AMDGPUExt/update_halo.jl +++ b/src/AMDGPUExt/update_halo.jl @@ -96,4 +96,137 @@ let global get_rocsendbufs_raw, get_rocrecvbufs_raw get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw) get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw) -end \ No newline at end of file +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function allocate_rocstreams(fields::GGField...) + allocate_rocstreams_iwrite(fields...); + allocate_rocstreams_iread(fields...); +end + +let + global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iwrite(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + # DEBUG: the follow section needs perf testing + # DEBUG 2: commenting read_h2d_async! for now + # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + # else + # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); + # end + end + end +end + +let + global iread_recvbufs!, allocate_rocstreams_iread, wait_iread + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iread(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. 
+ end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + # DEBUG: the follow section needs perf testing + # DEBUG 2: commenting read_h2d_async! for now + # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + # else + # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); + # end + end + end + +end + + +# (AMDGPU functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). +function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). +function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(sendbuf, Tuple(length.(sendranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(sendbuf), AMDGPU.Mem.HostBuffer, + pointer(A), typeof(A.buf), + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), + srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), + async=true, stream=rocstream + ) + return nothing +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). 
+function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(recvbuf, Tuple(length.(recvranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(A), typeof(A.buf), + pointer(recvbuf), AMDGPU.Mem.HostBuffer, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), + srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), + async=true, stream=rocstream + ) + return nothing +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber + @inbounds AMDGPU.copyto!(dst, src) +end diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl index 3124a85..bd58653 100644 --- a/src/CUDAExt/update_halo.jl +++ b/src/CUDAExt/update_halo.jl @@ -108,4 +108,127 @@ let global get_cusendbufs_raw, get_curecvbufs_raw get_cusendbufs_raw() = deepcopy(cusendbufs_raw) get_curecvbufs_raw() = deepcopy(curecvbufs_raw) -end \ No newline at end of file +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function allocate_custreams(fields::GGField...) + allocate_custreams_iwrite(fields...); + allocate_custreams_iread(fields...); +end + +let + global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iwrite(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); + end + end + end +end + +let + global iread_recvbufs!, allocate_custreams_iread, wait_iread + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iread(fields::GGField...) 
+ if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); + end + end + end +end + + +# (CUDA functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). +function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). 
+function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), + dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), + async=true, stream=custream + ) +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). +function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), + dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), + async=true, stream=custream + ) +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber + @inbounds CUDA.copyto!(dst, src) +end + diff --git a/src/update_halo.jl b/src/update_halo.jl index 2c38461..3413973 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -273,143 +273,6 @@ let end -# (CUDA functions) - -function allocate_custreams(fields::GGField...) - allocate_custreams_iwrite(fields...); - allocate_custreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iwrite(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); - end - end - end -end - -let - global iread_recvbufs!, allocate_custreams_iread, wait_iread - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iread(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); - end - end - end -end - - -# (AMDGPU functions) - -function allocate_rocstreams(fields::GGField...) - allocate_rocstreams_iwrite(fields...); - allocate_rocstreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iwrite(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); - # end - end - end -end - -let - global iread_recvbufs!, allocate_rocstreams_iread, wait_iread - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iread(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); - # end - end - end - -end - - # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). @@ -472,105 +335,6 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni end -# (CUDA functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), - dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), - async=true, stream=custream - ) -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), - dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), - async=true, stream=custream - ) -end - - -# (AMDGPU functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(sendbuf, Tuple(length.(sendranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(sendbuf), AMDGPU.Mem.HostBuffer, - pointer(A), typeof(A.buf), - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), - srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), - async=true, stream=rocstream - ) - return nothing -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(recvbuf, Tuple(length.(recvranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(A), typeof(A.buf), - pointer(recvbuf), AMDGPU.Mem.HostBuffer, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), - srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), - async=true, stream=rocstream - ) - return nothing -end - ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS @@ -643,20 +407,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T end -# (CUDA functions) - -function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber - @inbounds CUDA.copyto!(dst, src) -end - - -# (AMDGPU functions) - -function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber - @inbounds AMDGPU.copyto!(dst, src) -end - - ##------------------------------------------- ## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS
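
Note (illustration, not part of the patch): allocate_custreams_iwrite/iread and allocate_rocstreams_iwrite/iread above keep a NNEIGHBORS_PER_DIM x nfields matrix of streams and grow it lazily by horizontal concatenation whenever more fields are passed than columns already exist. The plain-Julia sketch below mimics only that growth pattern; make_stream is a hypothetical placeholder for CuStream(...) or AMDGPU.HIPStream(:high), and NNEIGHBORS_PER_DIM == 2 (one neighbor on each side of a dimension) is an assumption.

const NNEIGHBORS_PER_DIM = 2                       # assumed value: one neighbor per side and dimension
make_stream() = Ref(:stream)                       # hypothetical stand-in for CuStream(...) / HIPStream(:high)
streams = Array{Base.RefValue{Symbol}}(undef, NNEIGHBORS_PER_DIM, 0)
nfields = 3                                        # hypothetical number of fields passed in
if nfields > size(streams, 2)                      # grow only when more fields arrive than columns exist
    streams = [streams [make_stream() for n=1:NNEIGHBORS_PER_DIM, i=1:(nfields-size(streams,2))]]
end
@assert size(streams) == (NNEIGHBORS_PER_DIM, nfields)

Repeated calls with the same or fewer fields leave the matrix untouched, so streams are created once and then reused across halo updates.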
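The kernel launch geometry in iwrite_sendbufs! and iread_recvbufs! (identical in the CUDA and AMDGPU versions) can be checked on the host. The sizes below are hypothetical; the point is that the thread block is shaped (1, 32, 1) for dim == 1, presumably because the x extent of that halo is only the halo width, and (32, 1, 1) otherwise, and that nblocks always covers the full halo.

ranges   = [2:2, 1:1022, 1:1022]                   # hypothetical send ranges for dim == 1, halowidth 1
dim      = 1
nthreads = (dim == 1) ? (1, 32, 1) : (32, 1, 1)
halosize = [r[end] - r[1] + 1 for r in ranges]     # halo extent in x, y and z
nblocks  = Tuple(ceil.(Int, halosize ./ nthreads)) # enough blocks to cover the halo
@assert all(nblocks .* nthreads .>= Tuple(halosize))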
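The per-thread index arithmetic of write_d2x! amounts to packing the halo region of A into a dense buffer whose indices start at 1; read_x2d! is the mirror image, assigning A[ix,iy,iz] = gpurecvbuf[...] over the receive ranges. write_ref! below is a hypothetical serial reference, not part of the package, that does the same packing for a small array.

# Serial reference (hypothetical helper, for illustration only) of what one
# write_d2x! launch does: pack the halo region of A into a dense send buffer.
function write_ref!(sendbuf::AbstractArray{T,3}, A::AbstractArray{T,3},
                    rx::UnitRange{Int}, ry::UnitRange{Int}, rz::UnitRange{Int}) where T
    for iz in rz, iy in ry, ix in rx
        sendbuf[ix-(rx[1]-1), iy-(ry[1]-1), iz-(rz[1]-1)] = A[ix, iy, iz]
    end
    return sendbuf
end

A       = reshape(collect(1.0:64.0), 4, 4, 4)
sendbuf = zeros(1, 4, 4)                           # halo of width 1 in dimension 1
write_ref!(sendbuf, A, 2:2, 1:4, 1:4)
@assert sendbuf[1, :, :] == A[2, :, :]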
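The keyword arguments passed to the unsafe_copy3d! calls in write_d2h_async! (and, with source and destination swapped, in read_h2d_async!) encode that the host buffer is densely packed while the device array keeps its full x and y extents. The arithmetic below, with hypothetical sizes, mirrors those pitch and height parameters and checks that they describe exactly the sizeof(T) * prod(length.(sendranges)) bytes of one halo.

T          = Float64
sendranges = [2:2, 1:1022, 1:1022]                 # hypothetical send ranges (dim == 1, halowidth 1)
size_A     = (1024, 1024, 1024)                    # hypothetical size of the device array A
dstPitch   = sizeof(T) * length(sendranges[1])     # bytes per row of the densely packed host buffer
dstHeight  = length(sendranges[2])                 # buffer rows per 2-D slice
srcPitch   = sizeof(T) * size_A[1]                 # bytes per row of A (full x extent)
srcHeight  = size_A[2]                             # rows of A per 2-D slice
@assert dstPitch * dstHeight * length(sendranges[3]) == sizeof(T) * prod(length.(sendranges))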