
Commit

move code related to accessing buffers to extensions
omlins committed Jan 17, 2024
1 parent cc3767b commit 2c6e065
Showing 3 changed files with 258 additions and 252 deletions.
135 changes: 134 additions & 1 deletion src/AMDGPUExt/update_halo.jl
@@ -96,4 +96,137 @@ let
    global get_rocsendbufs_raw, get_rocrecvbufs_raw
    get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw)
    get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw)
end
end


##----------------------------------------------
## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS

function allocate_rocstreams(fields::GGField...)
    allocate_rocstreams_iwrite(fields...);
    allocate_rocstreams_iread(fields...);
end

let
    global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite

    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);

    function allocate_rocstreams_iwrite(fields::GGField...)
        if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField
            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            # DEBUG: the following section needs perf testing
            # DEBUG 2: commenting write_d2h_async! for now
            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
            ranges = sendranges(n, dim, F);
            nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
            halosize = [r[end] - r[1] + 1 for r in ranges];
            nblocks = Tuple(ceil.(Int, halosize./nthreads));
            @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            # else
            #     write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]);
            # end
        end
    end
end
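# Illustrative sketch of the lazy stream-matrix growth used in allocate_rocstreams_iwrite above,
# with a stand-in type so it runs without a GPU (the real code stores AMDGPU.HIPStream objects,
# one row per neighbor and one column per field); MockStream, NNEIGHBORS and grow_streams are
# hypothetical names introduced only for this sketch.
struct MockStream
    id::Int
end

const NNEIGHBORS = 2                                   # stands in for NNEIGHBORS_PER_DIM
mockstreams = Array{MockStream}(undef, NNEIGHBORS, 0)  # start with an empty 2x0 matrix

function grow_streams(streams, nfields)
    if nfields > size(streams, 2)                      # allocate only the missing columns
        newcols = [MockStream(n + 10i) for n = 1:NNEIGHBORS, i = 1:(nfields - size(streams, 2))]
        streams = [streams newcols]                    # horizontal concatenation, as in the code above
    end
    return streams
end

mockstreams = grow_streams(mockstreams, 3)             # now 2x3; growing to 3 again is a no-op
@assert size(mockstreams) == (NNEIGHBORS, 3)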

let
    global iread_recvbufs!, allocate_rocstreams_iread, wait_iread

    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);

    function allocate_rocstreams_iread(fields::GGField...)
        if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField
            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            # DEBUG: the following section needs perf testing
            # DEBUG 2: commenting read_h2d_async! for now
            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
            ranges = recvranges(n, dim, F);
            nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
            halosize = [r[end] - r[1] + 1 for r in ranges];
            nblocks = Tuple(ceil.(Int, halosize./nthreads));
            @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            # else
            #     read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]);
            # end
        end
    end

end
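# A minimal numeric sketch of the halo condition checked above (ol(dim,A) >= 2*halowidths[dim]):
# with an overlap of 4 cells and a halo width of 2 the update is performed, with an overlap of 3
# it is skipped; has_halo is a hypothetical stand-in for the in-place check.
has_halo(overlap, halowidth) = overlap >= 2*halowidth
@assert has_halo(4, 2)
@assert !has_halo(3, 2)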


# (AMDGPU functions)

# Write to the send buffer on the host or device from the array on the device (d2x).
function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1
    iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1
    iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1
    if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end
    gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz];
    return nothing
end

# Read from the receive buffer on the host or device and store on the array on the device (x2d).
function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1
    iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1
    iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1
    if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end
    A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)];
    return nothing
end
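# CPU reference of what write_d2x! and read_x2d! compute, using plain Julia arrays (a sketch for
# clarity only; the *_cpu names are hypothetical and not part of this package).
function write_d2x_cpu!(sendbuf, A, rx::UnitRange, ry::UnitRange, rz::UnitRange)
    sendbuf[1:length(rx), 1:length(ry), 1:length(rz)] .= view(A, rx, ry, rz)  # pack the halo into the buffer
    return nothing
end

function read_x2d_cpu!(recvbuf, A, rx::UnitRange, ry::UnitRange, rz::UnitRange)
    view(A, rx, ry, rz) .= recvbuf[1:length(rx), 1:length(ry), 1:length(rz)]  # unpack the buffer into the halo
    return nothing
end

A   = reshape(collect(1.0:4*5*6), 4, 5, 6)
buf = zeros(1, 5, 6)                   # halo of width 1 in dimension 1
write_d2x_cpu!(buf, A, 4:4, 1:5, 1:6)  # extract the last x-plane
@assert buf[1, :, :] == A[4, :, :]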

# Write to the send buffer on the host from the array on the device (d2h).
function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
    buf_view = reshape(sendbuf, Tuple(length.(sendranges)))
    AMDGPU.Mem.unsafe_copy3d!(
        pointer(sendbuf), AMDGPU.Mem.HostBuffer,
        pointer(A), typeof(A.buf),
        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
        dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2),
        srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2),
        async=true, stream=rocstream
    )
    return nothing
end

# Read from the receive buffer on the host and store on the array on the device (h2d).
function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
    buf_view = reshape(recvbuf, Tuple(length.(recvranges)))
    AMDGPU.Mem.unsafe_copy3d!(
        pointer(A), typeof(A.buf),
        pointer(recvbuf), AMDGPU.Mem.HostBuffer,
        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
        dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2),
        srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2),
        async=true, stream=rocstream
    )
    return nothing
end
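# CPU sketch of the data movement that write_d2h_async! performs: the flat host buffer is viewed
# with the shape of the send ranges and the corresponding sub-block of A is copied into it
# (illustrative only; write_d2h_cpu! is a hypothetical name, not part of this package).
function write_d2h_cpu!(sendbuf::AbstractVector, A::Array, sendranges::Vector{<:UnitRange})
    buf_view = reshape(sendbuf, Tuple(length.(sendranges)))
    buf_view .= view(A, sendranges...)
    return nothing
end

A       = rand(8, 8, 8)
ranges  = [2:4, 1:8, 1:8]
hostbuf = zeros(prod(length.(ranges)))
write_d2h_cpu!(hostbuf, A, ranges)
@assert reshape(hostbuf, 3, 8, 8) == A[2:4, 1:8, 1:8]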


##------------------------------
## FUNCTIONS TO SEND/RECV FIELDS

function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber
    @inbounds AMDGPU.copyto!(dst, src)
end
125 changes: 124 additions & 1 deletion src/CUDAExt/update_halo.jl
@@ -108,4 +108,127 @@ let
    global get_cusendbufs_raw, get_curecvbufs_raw
    get_cusendbufs_raw() = deepcopy(cusendbufs_raw)
    get_curecvbufs_raw() = deepcopy(curecvbufs_raw)
end
end


##----------------------------------------------
## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS

function allocate_custreams(fields::GGField...)
    allocate_custreams_iwrite(fields...);
    allocate_custreams_iread(fields...);
end

let
    global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite

    custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]);

    function allocate_custreams_iwrite(fields::GGField...)
        if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField
            custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                ranges = sendranges(n, dim, F);
                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                halosize = [r[end] - r[1] + 1 for r in ranges];
                nblocks = Tuple(ceil.(Int, halosize./nthreads));
                @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            else
                write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]);
            end
        end
    end
end
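# Worked example of the launch configuration computed in iwrite_sendbufs! above: for a halo slab
# of size 1 x 256 x 128 in dimension 1, the (1, 32, 1) thread block yields a (1, 8, 128) grid
# (a sketch with made-up sizes; the real values depend on the fields and their send/recv ranges).
dim      = 1
halosize = (1, 256, 128)
nthreads = (dim == 1) ? (1, 32, 1) : (32, 1, 1)
nblocks  = Tuple(ceil.(Int, halosize ./ nthreads))
@assert nblocks == (1, 8, 128)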

let
    global iread_recvbufs!, allocate_custreams_iread, wait_iread

    custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]);

    function allocate_custreams_iread(fields::GGField...)
        if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField
            custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                ranges = recvranges(n, dim, F);
                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                halosize = [r[end] - r[1] + 1 for r in ranges];
                nblocks = Tuple(ceil.(Int, halosize./nthreads));
                @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            else
                read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]);
            end
        end
    end
end
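# Sketch of creating a single maximum-priority, non-blocking stream as done in
# allocate_custreams_iwrite/_iread above (assumes CUDA.jl is loaded and a device is available).
# Kernels launched with `@cuda stream=s ...` can then overlap with work on the default stream,
# and CUDA.synchronize(s) waits for them to complete.
using CUDA
s = CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end])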


# (CUDA functions)

# Write to the send buffer on the host or device from the array on the device (d2x).
function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1
    iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1
    iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1
    if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end
    gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz];
    return nothing
end

# Read from the receive buffer on the host or device and store on the array on the device (x2d).
function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1
    iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1
    iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1
    if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end
    A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)];
    return nothing
end

# Write to the send buffer on the host from the array on the device (d2h).
function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer
    CUDA.Mem.unsafe_copy3d!(
        pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device,
        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
        srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2),
        dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]),
        async=true, stream=custream
    )
end

# Read from the receive buffer on the host and store on the array on the device (h2d).
function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer
    CUDA.Mem.unsafe_copy3d!(
        pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host,
        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
        srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]),
        dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2),
        async=true, stream=custream
    )
end


##------------------------------
## FUNCTIONS TO SEND/RECV FIELDS

function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber
    @inbounds CUDA.copyto!(dst, src)
end
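# Minimal usage sketch (assumes CUDA.jl and a CUDA device): gpumemcopy! is a thin wrapper around
# CUDA.copyto! for device-to-device copies between equally sized arrays.
using CUDA
src = CUDA.rand(Float64, 16, 16)
dst = CUDA.zeros(Float64, 16, 16)
gpumemcopy!(dst, src)            # device-to-device copy
@assert Array(dst) == Array(src)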

