
Commit

move code related to accessing buffers to extensions
omlins committed Jan 17, 2024
1 parent cc3767b commit 2c6e065
Showing 3 changed files with 258 additions and 252 deletions.
135 changes: 134 additions & 1 deletion src/AMDGPUExt/update_halo.jl
@@ -96,4 +96,137 @@ let
    global get_rocsendbufs_raw, get_rocrecvbufs_raw
    get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw)
    get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw)
end
end


##----------------------------------------------
## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS

function allocate_rocstreams(fields::GGField...)
    allocate_rocstreams_iwrite(fields...);
    allocate_rocstreams_iread(fields...);
end

let
    global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite

    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);

    function allocate_rocstreams_iwrite(fields::GGField...)
        if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField
            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            # DEBUG: the following section needs perf testing
            # DEBUG 2: commenting write_d2h_async! for now
            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
            ranges = sendranges(n, dim, F);
            nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
            halosize = [r[end] - r[1] + 1 for r in ranges];
            nblocks = Tuple(ceil.(Int, halosize./nthreads));
            @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            # else
            #     write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]);
            # end
        end
    end
end
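# Illustrative sketch of the lazy stream-matrix growth used in allocate_rocstreams_iwrite above,
# with a stand-in type so it runs without a GPU (the real code stores AMDGPU.HIPStream objects,
# one row per neighbor and one column per field); MockStream, NNEIGHBORS and grow_streams are
# hypothetical names introduced only for this sketch.
struct MockStream
    id::Int
end

const NNEIGHBORS = 2                                   # stands in for NNEIGHBORS_PER_DIM
mockstreams = Array{MockStream}(undef, NNEIGHBORS, 0)  # start with an empty 2x0 matrix

function grow_streams(streams, nfields)
    if nfields > size(streams, 2)                      # allocate only the missing columns
        newcols = [MockStream(n + 10i) for n = 1:NNEIGHBORS, i = 1:(nfields - size(streams, 2))]
        streams = [streams newcols]                    # horizontal concatenation, as in the code above
    end
    return streams
end

mockstreams = grow_streams(mockstreams, 3)             # now 2x3; growing to 3 again is a no-op
@assert size(mockstreams) == (NNEIGHBORS, 3)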

let
    global iread_recvbufs!, allocate_rocstreams_iread, wait_iread

    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);

    function allocate_rocstreams_iread(fields::GGField...)
        if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField
            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            # DEBUG: the following section needs perf testing
            # DEBUG 2: commenting read_h2d_async! for now
            # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
            ranges = recvranges(n, dim, F);
            nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
            halosize = [r[end] - r[1] + 1 for r in ranges];
            nblocks = Tuple(ceil.(Int, halosize./nthreads));
            @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            # else
            #     read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]);
            # end
        end
    end

end
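# A minimal numeric sketch of the halo condition checked above (ol(dim,A) >= 2*halowidths[dim]):
# with an overlap of 4 cells and a halo width of 2 the update is performed, with an overlap of 3
# it is skipped; has_halo is a hypothetical stand-in for the in-place check.
has_halo(overlap, halowidth) = overlap >= 2*halowidth
@assert has_halo(4, 2)
@assert !has_halo(3, 2)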


# (AMDGPU functions)

# Write to the send buffer on the host or device from the array on the device (d2x).
function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1
    iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1
    iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1
    if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end
    gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz];
    return nothing
end

# Read from the receive buffer on the host or device and store on the array on the device (x2d).
function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1
    iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1
    iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1
    if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end
    A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)];
    return nothing
end
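# CPU reference of what write_d2x! and read_x2d! compute, using plain Julia arrays (a sketch for
# clarity only; the *_cpu names are hypothetical and not part of this package).
function write_d2x_cpu!(sendbuf, A, rx::UnitRange, ry::UnitRange, rz::UnitRange)
    sendbuf[1:length(rx), 1:length(ry), 1:length(rz)] .= view(A, rx, ry, rz)  # pack the halo into the buffer
    return nothing
end

function read_x2d_cpu!(recvbuf, A, rx::UnitRange, ry::UnitRange, rz::UnitRange)
    view(A, rx, ry, rz) .= recvbuf[1:length(rx), 1:length(ry), 1:length(rz)]  # unpack the buffer into the halo
    return nothing
end

A   = reshape(collect(1.0:4*5*6), 4, 5, 6)
buf = zeros(1, 5, 6)                   # halo of width 1 in dimension 1
write_d2x_cpu!(buf, A, 4:4, 1:5, 1:6)  # extract the last x-plane
@assert buf[1, :, :] == A[4, :, :]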

# Write to the send buffer on the host from the array on the device (d2h).
function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
    buf_view = reshape(sendbuf, Tuple(length.(sendranges)))
    AMDGPU.Mem.unsafe_copy3d!(
        pointer(sendbuf), AMDGPU.Mem.HostBuffer,
        pointer(A), typeof(A.buf),
        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
        dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2),
        srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2),
        async=true, stream=rocstream
    )
    return nothing
end

# Read from the receive buffer on the host and store on the array on the device (h2d).
function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer
    buf_view = reshape(recvbuf, Tuple(length.(recvranges)))
    AMDGPU.Mem.unsafe_copy3d!(
        pointer(A), typeof(A.buf),
        pointer(recvbuf), AMDGPU.Mem.HostBuffer,
        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
        dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2),
        srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2),
        async=true, stream=rocstream
    )
    return nothing
end
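# CPU sketch of the data movement that write_d2h_async! performs: the flat host buffer is viewed
# with the shape of the send ranges and the corresponding sub-block of A is copied into it
# (illustrative only; write_d2h_cpu! is a hypothetical name, not part of this package).
function write_d2h_cpu!(sendbuf::AbstractVector, A::Array, sendranges::Vector{<:UnitRange})
    buf_view = reshape(sendbuf, Tuple(length.(sendranges)))
    buf_view .= view(A, sendranges...)
    return nothing
end

A       = rand(8, 8, 8)
ranges  = [2:4, 1:8, 1:8]
hostbuf = zeros(prod(length.(ranges)))
write_d2h_cpu!(hostbuf, A, ranges)
@assert reshape(hostbuf, 3, 8, 8) == A[2:4, 1:8, 1:8]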


##------------------------------
## FUNCTIONS TO SEND/RECV FIELDS

function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber
    @inbounds AMDGPU.copyto!(dst, src)
end
125 changes: 124 additions & 1 deletion src/CUDAExt/update_halo.jl
@@ -108,4 +108,127 @@ let
    global get_cusendbufs_raw, get_curecvbufs_raw
    get_cusendbufs_raw() = deepcopy(cusendbufs_raw)
    get_curecvbufs_raw() = deepcopy(curecvbufs_raw)
end
end


##----------------------------------------------
## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS

function allocate_custreams(fields::GGField...)
    allocate_custreams_iwrite(fields...);
    allocate_custreams_iread(fields...);
end

let
    global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite

    custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]);

    function allocate_custreams_iwrite(fields::GGField...)
        if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField
            custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                ranges = sendranges(n, dim, F);
                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                halosize = [r[end] - r[1] + 1 for r in ranges];
                nblocks = Tuple(ceil.(Int, halosize./nthreads));
                @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            else
                write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]);
            end
        end
    end
end
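# Worked example of the launch configuration computed in iwrite_sendbufs! above: for a halo slab
# of size 1 x 256 x 128 in dimension 1, the (1, 32, 1) thread block yields a (1, 8, 128) grid
# (a sketch with made-up sizes; the real values depend on the fields and their send/recv ranges).
dim      = 1
halosize = (1, 256, 128)
nthreads = (dim == 1) ? (1, 32, 1) : (32, 1, 1)
nblocks  = Tuple(ceil.(Int, halosize ./ nthreads))
@assert nblocks == (1, 8, 128)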

let
    global iread_recvbufs!, allocate_custreams_iread, wait_iread

    custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0)

    wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]);

    function allocate_custreams_iread(fields::GGField...)
        if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField
            custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
        end
    end

    function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber
        A, halowidths = F;
        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
            if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
                ranges = recvranges(n, dim, F);
                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
                halosize = [r[end] - r[1] + 1 for r in ranges];
                nblocks = Tuple(ceil.(Int, halosize./nthreads));
                @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
            else
                read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]);
            end
        end
    end
end
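# Sketch of creating a single maximum-priority, non-blocking stream as done in
# allocate_custreams_iwrite/_iread above (assumes CUDA.jl is loaded and a device is available).
# Kernels launched with `@cuda stream=s ...` can then overlap with work on the default stream,
# and CUDA.synchronize(s) waits for them to complete.
using CUDA
s = CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end])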


# (CUDA functions)

# Write to the send buffer on the host or device from the array on the device (d2x).
function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1
    iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1
    iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1
    if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end
    gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz];
    return nothing
end

# Read from the receive buffer on the host or device and store on the array on the device (x2d).
function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
    ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1
    iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1
    iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1
    if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end
    A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)];
    return nothing
end

# Write to the send buffer on the host from the array on the device (d2h).
function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer
    CUDA.Mem.unsafe_copy3d!(
        pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device,
        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);
        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),
        srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2),
        dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]),
        async=true, stream=custream
    )
end

# Read from the receive buffer on the host and store on the array on the device (h2d).
function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer
    CUDA.Mem.unsafe_copy3d!(
        pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host,
        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);
        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),
        srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]),
        dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2),
        async=true, stream=custream
    )
end


##------------------------------
## FUNCTIONS TO SEND/RECV FIELDS

function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber
    @inbounds CUDA.copyto!(dst, src)
end
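# Minimal usage sketch (assumes CUDA.jl and a CUDA device): gpumemcopy! is a thin wrapper around
# CUDA.copyto! for device-to-device copies between equally sized arrays.
using CUDA
src = CUDA.rand(Float64, 16, 16)
dst = CUDA.zeros(Float64, 16, 16)
gpumemcopy!(dst, src)            # device-to-device copy
@assert Array(dst) == Array(src)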

