From 2c6e065b25bb258fb67366430b0f966948c80914 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:41:09 +0100 Subject: [PATCH] move code related to accessing buffers to extensions --- src/AMDGPUExt/update_halo.jl | 135 ++++++++++++++++++- src/CUDAExt/update_halo.jl | 125 +++++++++++++++++- src/update_halo.jl | 250 ----------------------------------- 3 files changed, 258 insertions(+), 252 deletions(-) diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl index acbdea7..381f5f4 100644 --- a/src/AMDGPUExt/update_halo.jl +++ b/src/AMDGPUExt/update_halo.jl @@ -96,4 +96,137 @@ let global get_rocsendbufs_raw, get_rocrecvbufs_raw get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw) get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw) -end \ No newline at end of file +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function allocate_rocstreams(fields::GGField...) + allocate_rocstreams_iwrite(fields...); + allocate_rocstreams_iread(fields...); +end + +let + global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iwrite(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + # DEBUG: the follow section needs perf testing + # DEBUG 2: commenting read_h2d_async! for now + # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + # else + # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); + # end + end + end +end + +let + global iread_recvbufs!, allocate_rocstreams_iread, wait_iread + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iread(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. 
+ end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + # DEBUG: the follow section needs perf testing + # DEBUG 2: commenting read_h2d_async! for now + # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + # else + # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); + # end + end + end + +end + + +# (AMDGPU functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). +function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). +function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(sendbuf, Tuple(length.(sendranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(sendbuf), AMDGPU.Mem.HostBuffer, + pointer(A), typeof(A.buf), + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), + srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), + async=true, stream=rocstream + ) + return nothing +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). 
+function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(recvbuf, Tuple(length.(recvranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(A), typeof(A.buf), + pointer(recvbuf), AMDGPU.Mem.HostBuffer, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), + srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), + async=true, stream=rocstream + ) + return nothing +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber + @inbounds AMDGPU.copyto!(dst, src) +end diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl index 3124a85..bd58653 100644 --- a/src/CUDAExt/update_halo.jl +++ b/src/CUDAExt/update_halo.jl @@ -108,4 +108,127 @@ let global get_cusendbufs_raw, get_curecvbufs_raw get_cusendbufs_raw() = deepcopy(cusendbufs_raw) get_curecvbufs_raw() = deepcopy(curecvbufs_raw) -end \ No newline at end of file +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function allocate_custreams(fields::GGField...) + allocate_custreams_iwrite(fields...); + allocate_custreams_iread(fields...); +end + +let + global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iwrite(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); + end + end + end +end + +let + global iread_recvbufs!, allocate_custreams_iread, wait_iread + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iread(fields::GGField...) 
+ if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); + end + end + end +end + + +# (CUDA functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). +function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). 
+function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), + dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), + async=true, stream=custream + ) +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). +function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), + dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), + async=true, stream=custream + ) +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber + @inbounds CUDA.copyto!(dst, src) +end + diff --git a/src/update_halo.jl b/src/update_halo.jl index 2c38461..3413973 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -273,143 +273,6 @@ let end -# (CUDA functions) - -function allocate_custreams(fields::GGField...) - allocate_custreams_iwrite(fields...); - allocate_custreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iwrite(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); - end - end - end -end - -let - global iread_recvbufs!, allocate_custreams_iread, wait_iread - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iread(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); - end - end - end -end - - -# (AMDGPU functions) - -function allocate_rocstreams(fields::GGField...) - allocate_rocstreams_iwrite(fields...); - allocate_rocstreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iwrite(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); - # end - end - end -end - -let - global iread_recvbufs!, allocate_rocstreams_iread, wait_iread - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iread(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); - # end - end - end - -end - - # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). @@ -472,105 +335,6 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni end -# (CUDA functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), - dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), - async=true, stream=custream - ) -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), - dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), - async=true, stream=custream - ) -end - - -# (AMDGPU functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(sendbuf, Tuple(length.(sendranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(sendbuf), AMDGPU.Mem.HostBuffer, - pointer(A), typeof(A.buf), - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), - srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), - async=true, stream=rocstream - ) - return nothing -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(recvbuf, Tuple(length.(recvranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(A), typeof(A.buf), - pointer(recvbuf), AMDGPU.Mem.HostBuffer, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), - srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), - async=true, stream=rocstream - ) - return nothing -end - ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS @@ -643,20 +407,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T end -# (CUDA functions) - -function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber - @inbounds CUDA.copyto!(dst, src) -end - - -# (AMDGPU functions) - -function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber - @inbounds AMDGPU.copyto!(dst, src) -end - - ##------------------------------------------- ## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS
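
Note (illustration, not part of the patch): allocate_custreams_iwrite/iread and allocate_rocstreams_iwrite/iread above keep a NNEIGHBORS_PER_DIM x nfields matrix of streams and grow it lazily by horizontal concatenation whenever more fields are passed than columns already exist. The plain-Julia sketch below mimics only that growth pattern; make_stream is a hypothetical placeholder for CuStream(...) or AMDGPU.HIPStream(:high), and NNEIGHBORS_PER_DIM == 2 (one neighbor on each side of a dimension) is an assumption.

const NNEIGHBORS_PER_DIM = 2                       # assumed value: one neighbor per side and dimension
make_stream() = Ref(:stream)                       # hypothetical stand-in for CuStream(...) / HIPStream(:high)
streams = Array{Base.RefValue{Symbol}}(undef, NNEIGHBORS_PER_DIM, 0)
nfields = 3                                        # hypothetical number of fields passed in
if nfields > size(streams, 2)                      # grow only when more fields arrive than columns exist
    streams = [streams [make_stream() for n=1:NNEIGHBORS_PER_DIM, i=1:(nfields-size(streams,2))]]
end
@assert size(streams) == (NNEIGHBORS_PER_DIM, nfields)

Repeated calls with the same or fewer fields leave the matrix untouched, so streams are created once and then reused across halo updates.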
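The kernel launch geometry in iwrite_sendbufs! and iread_recvbufs! (identical in the CUDA and AMDGPU versions) can be checked on the host. The sizes below are hypothetical; the point is that the thread block is shaped (1, 32, 1) for dim == 1, presumably because the x extent of that halo is only the halo width, and (32, 1, 1) otherwise, and that nblocks always covers the full halo.

ranges   = [2:2, 1:1022, 1:1022]                   # hypothetical send ranges for dim == 1, halowidth 1
dim      = 1
nthreads = (dim == 1) ? (1, 32, 1) : (32, 1, 1)
halosize = [r[end] - r[1] + 1 for r in ranges]     # halo extent in x, y and z
nblocks  = Tuple(ceil.(Int, halosize ./ nthreads)) # enough blocks to cover the halo
@assert all(nblocks .* nthreads .>= Tuple(halosize))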
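The per-thread index arithmetic of write_d2x! amounts to packing the halo region of A into a dense buffer whose indices start at 1; read_x2d! is the mirror image, assigning A[ix,iy,iz] = gpurecvbuf[...] over the receive ranges. write_ref! below is a hypothetical serial reference, not part of the package, that does the same packing for a small array.

# Serial reference (hypothetical helper, for illustration only) of what one
# write_d2x! launch does: pack the halo region of A into a dense send buffer.
function write_ref!(sendbuf::AbstractArray{T,3}, A::AbstractArray{T,3},
                    rx::UnitRange{Int}, ry::UnitRange{Int}, rz::UnitRange{Int}) where T
    for iz in rz, iy in ry, ix in rx
        sendbuf[ix-(rx[1]-1), iy-(ry[1]-1), iz-(rz[1]-1)] = A[ix, iy, iz]
    end
    return sendbuf
end

A       = reshape(collect(1.0:64.0), 4, 4, 4)
sendbuf = zeros(1, 4, 4)                           # halo of width 1 in dimension 1
write_ref!(sendbuf, A, 2:2, 1:4, 1:4)
@assert sendbuf[1, :, :] == A[2, :, :]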
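The keyword arguments passed to the unsafe_copy3d! calls in write_d2h_async! (and, with source and destination swapped, in read_h2d_async!) encode that the host buffer is densely packed while the device array keeps its full x and y extents. The arithmetic below, with hypothetical sizes, mirrors those pitch and height parameters and checks that they describe exactly the sizeof(T) * prod(length.(sendranges)) bytes of one halo.

T          = Float64
sendranges = [2:2, 1:1022, 1:1022]                 # hypothetical send ranges (dim == 1, halowidth 1)
size_A     = (1024, 1024, 1024)                    # hypothetical size of the device array A
dstPitch   = sizeof(T) * length(sendranges[1])     # bytes per row of the densely packed host buffer
dstHeight  = length(sendranges[2])                 # buffer rows per 2-D slice
srcPitch   = sizeof(T) * size_A[1]                 # bytes per row of A (full x extent)
srcHeight  = size_A[2]                             # rows of A per 2-D slice
@assert dstPitch * dstHeight * length(sendranges[3]) == sizeof(T) * prod(length.(sendranges))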