diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl
new file mode 100644
index 0000000..acbdea7
--- /dev/null
+++ b/src/AMDGPUExt/update_halo.jl
@@ -0,0 +1,99 @@
+##---------------------------------------
+## FUNCTIONS RELATED TO BUFFER ALLOCATION
+
+# NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
+
+let
+    global free_update_halo_rocbuffers, reset_roc_buffers, free_rocbufs
+    global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat
+    rocsendbufs_raw = nothing
+    rocrecvbufs_raw = nothing
+    # INFO: no need for roc host buffers
+
+    function free_update_halo_rocbuffers()
+        free_rocbufs(rocsendbufs_raw)
+        free_rocbufs(rocrecvbufs_raw)
+        # INFO: no need for roc host buffers
+        reset_roc_buffers()
+    end
+
+    function free_rocbufs(bufs)
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
+                end
+            end
+        end
+    end
+
+    # INFO: no need for roc host buffers
+    # function unregister_rocbufs(bufs)
+    # end
+
+    function reset_roc_buffers()
+        rocsendbufs_raw = nothing
+        rocrecvbufs_raw = nothing
+        # INFO: no need for roc host buffers
+    end
+
+
+    # (AMDGPU functions)
+
+    function init_rocbufs_arrays()
+        rocsendbufs_raw = Array{Array{Any,1},1}();
+        rocrecvbufs_raw = Array{Array{Any,1},1}();
+        # INFO: no need for roc host buffers
+    end
+
+    function init_rocbufs(T::DataType, fields::GGField...)
+        while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
+        while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
+        # INFO: no need for roc host buffers
+    end
+
+    function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer)
+        if (eltype(rocsendbufs_raw[i][n]) != T) rocsendbufs_raw[i][n] = reinterpret(T, rocsendbufs_raw[i][n]); end
+        if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end
+    end
+
+    function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
+        rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
+        rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
+    end
+
+    function reregister_rocbufs(T::DataType, i::Integer, n::Integer)
+        # INFO: no need for roc host buffers
+        rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]); # NOTE(review): sendbufs_raw/recvbufs_raw are locals of the let block in src/update_halo.jl; confirm they are reachable from this extension.
+        rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]);
+    end
+
+
+    # (AMDGPU functions)
+
+    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A)));
+    end
+
+    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A)));
+    end
+
+
+    # (GPU functions)
+
+    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
+    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+
+    # Make rocsendbufs_raw and rocrecvbufs_raw accessible for unit testing.
+    global get_rocsendbufs_raw, get_rocrecvbufs_raw
+    get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw)
+    get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw)
+end
\ No newline at end of file
diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl
new file mode 100644
index 0000000..3124a85
--- /dev/null
+++ b/src/CUDAExt/update_halo.jl
@@ -0,0 +1,111 @@
+##---------------------------------------
+## FUNCTIONS RELATED TO BUFFER ALLOCATION
+
+# NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
+
+let
+    global free_update_halo_cubuffers, reset_cu_buffers, free_cubufs, unregister_cubufs
+    global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat
+    cusendbufs_raw = nothing
+    curecvbufs_raw = nothing
+    cusendbufs_raw_h = nothing
+    curecvbufs_raw_h = nothing
+
+    function free_update_halo_cubuffers()
+        free_cubufs(cusendbufs_raw)
+        free_cubufs(curecvbufs_raw)
+        unregister_cubufs(cusendbufs_raw_h)
+        unregister_cubufs(curecvbufs_raw_h)
+        reset_cu_buffers()
+    end
+
+    function free_cubufs(bufs)
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if is_cuarray(bufs[i][n]) CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
+                end
+            end
+        end
+    end
+
+    function unregister_cubufs(bufs)
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
+                end
+            end
+        end
+    end
+
+    function reset_cu_buffers()
+        cusendbufs_raw = nothing
+        curecvbufs_raw = nothing
+        cusendbufs_raw_h = nothing
+        curecvbufs_raw_h = nothing
+    end
+
+
+    # (CUDA functions)
+
+    function init_cubufs_arrays()
+        cusendbufs_raw = Array{Array{Any,1},1}();
+        curecvbufs_raw = Array{Array{Any,1},1}();
+        cusendbufs_raw_h = Array{Array{Any,1},1}();
+        curecvbufs_raw_h = Array{Array{Any,1},1}();
+    end
+
+    function init_cubufs(T::DataType, fields::GGField...)
+        while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end
+        while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end
+        while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end
+        while (length(curecvbufs_raw_h) < length(fields)) push!(curecvbufs_raw_h, [[], []]); end
+    end
+
+    function reinterpret_cubufs(T::DataType, i::Integer, n::Integer)
+        if (eltype(cusendbufs_raw[i][n]) != T) cusendbufs_raw[i][n] = reinterpret(T, cusendbufs_raw[i][n]); end
+        if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end
+    end
+
+    function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
+        cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
+        curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
+    end
+
+    function reregister_cubufs(T::DataType, i::Integer, n::Integer)
+        if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T))
+        if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T))
+        cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]); # NOTE(review): sendbufs_raw/recvbufs_raw are locals of the let block in src/update_halo.jl; confirm they are reachable from this extension.
+        curecvbufs_raw[i][n], curecvbufs_raw_h[i][n] = register(CuArray,recvbufs_raw[i][n]);
+    end
+
+
+    # (CUDA functions)
+
+    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A)));
+    end
+
+    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A)));
+    end
+
+
+    # (GPU functions)
+
+    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
+    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+
+    # Make cusendbufs_raw and curecvbufs_raw accessible for unit testing.
+    global get_cusendbufs_raw, get_curecvbufs_raw
+    get_cusendbufs_raw() = deepcopy(cusendbufs_raw)
+    get_curecvbufs_raw() = deepcopy(curecvbufs_raw)
+end
\ No newline at end of file
diff --git a/src/update_halo.jl b/src/update_halo.jl
index 4be9d87..2c38461 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -95,60 +95,25 @@ halosize(dim::Integer, A::GGField) = (dim==1) ? (A.halowidths[1], size(A,2), siz
 
 # NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
 let
+    #TODO: this was: global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat
     global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat
     sendbufs_raw = nothing
     recvbufs_raw = nothing
-    cusendbufs_raw = nothing
-    curecvbufs_raw = nothing
-    cusendbufs_raw_h = nothing
-    curecvbufs_raw_h = nothing
-    rocsendbufs_raw = nothing
-    rocrecvbufs_raw = nothing
-    # INFO: no need for roc host buffers
 
     function free_update_halo_buffers()
-        if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(cusendbufs_raw) end
-        if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(curecvbufs_raw) end
-        if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(cusendbufs_raw_h) end
-        if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(curecvbufs_raw_h) end
-        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
-        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
-        # INFO: no need to unregister roc host buffers
-        sendbufs_raw = nothing
-        recvbufs_raw = nothing
-        cusendbufs_raw = nothing
-        curecvbufs_raw = nothing
-        cusendbufs_raw_h = nothing
-        curecvbufs_raw_h = nothing
-        rocsendbufs_raw = nothing
-        rocrecvbufs_raw = nothing
-        # INFO: no need for roc host buffers
-        GC.gc()
+        free_update_halo_cpubuffers()
+        if (cuda_enabled()) free_update_halo_cubuffers() end # Free/unregister/reset regardless of CUDA-awareness of MPI: the callee guards each entry by its type.
+        if (amdgpu_enabled()) free_update_halo_rocbuffers() end # Same for AMDGPU: GPU-aware runs must also free and reset their device buffers.
+        GC.gc() #TODO: see how to modify this!
     end
 
-
-    # (CUDA, AMDGPU functions)
-
-    function free_gpubufs(bufs)
-        if (bufs !== nothing)
-            for i = 1:length(bufs)
-                for n = 1:length(bufs[i])
-                    if is_cuarray(bufs[i][n]) CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
-                    if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
-                end
-            end
-        end
+    function free_update_halo_cpubuffers()
+        reset_cpu_buffers();
     end
 
-    function unregister_gpubufs(bufs)
-        if (bufs !== nothing)
-            for i = 1:length(bufs)
-                for n = 1:length(bufs[i])
-                    if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
-                    # INFO: no need for roc host buffers
-                end
-            end
-        end
+    function reset_cpu_buffers()
+        sendbufs_raw = nothing
+        recvbufs_raw = nothing
     end
 
     # Allocate for each field two send and recv buffers (one for the left and one for the right neighbour of a dimension). The required length of the buffer is given by the maximal number of halo elements in any of the dimensions. Note that buffers are not allocated separately for each dimension, as the updates are performed one dimension at a time (required for correctness).
@@ -215,71 +180,6 @@ let
     end
 
-
-    # (CUDA functions)
-
-    function init_cubufs_arrays()
-        cusendbufs_raw = Array{Array{Any,1},1}();
-        curecvbufs_raw = Array{Array{Any,1},1}();
-        cusendbufs_raw_h = Array{Array{Any,1},1}();
-        curecvbufs_raw_h = Array{Array{Any,1},1}();
-    end
-
-    function init_cubufs(T::DataType, fields::GGField...)
-        while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end
-        while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end
-        while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end
-        while (length(curecvbufs_raw_h) < length(fields)) push!(curecvbufs_raw_h, [[], []]); end
-    end
-
-    function reinterpret_cubufs(T::DataType, i::Integer, n::Integer)
-        if (eltype(cusendbufs_raw[i][n]) != T) cusendbufs_raw[i][n] = reinterpret(T, cusendbufs_raw[i][n]); end
-        if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end
-    end
-
-    function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
-        cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
-        curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
-    end
-
-    function reregister_cubufs(T::DataType, i::Integer, n::Integer)
-        if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T))
-        if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T))
-        cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]);
-        curecvbufs_raw[i][n], curecvbufs_raw_h[i][n] = register(CuArray,recvbufs_raw[i][n]);
-    end
-
-
-    # (AMDGPU functions)
-
-    function init_rocbufs_arrays()
-        rocsendbufs_raw = Array{Array{Any,1},1}();
-        rocrecvbufs_raw = Array{Array{Any,1},1}();
-        # INFO: no need for roc host buffers
-    end
-
-    function init_rocbufs(T::DataType, fields::GGField...)
-        while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
-        while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end
-        # INFO: no need for roc host buffers
-    end
-
-    function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer)
-        if (eltype(rocsendbufs_raw[i][n]) != T) rocsendbufs_raw[i][n] = reinterpret(T, rocsendbufs_raw[i][n]); end
-        if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end
-    end
-
-    function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
-        rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
-        rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
-    end
-
-    function reregister_rocbufs(T::DataType, i::Integer, n::Integer)
-        # INFO: no need for roc host buffers
-        rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]);
-        rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]);
-    end
-
-
     # (CPU functions)
 
     function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber
@@ -298,49 +198,10 @@ let
         return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A));
     end
 
-
-    # (CUDA functions)
-
-    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
-        return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A)));
-    end
-
-    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
-        return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A)));
-    end
-
-
-    # (AMDGPU functions)
-
-    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
-        return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A)));
-    end
-
-    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
-        return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A)));
-    end
-
-
-    # (GPU functions)
-
-    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
-    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber
-        return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
-    end
-
-    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber
-        return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
-    end
-
-
     # Make sendbufs_raw and recvbufs_raw accessible for unit testing.
-    global get_sendbufs_raw, get_recvbufs_raw, get_cusendbufs_raw, get_curecvbufs_raw, get_rocsendbufs_raw, get_rocrecvbufs_raw
+    global get_sendbufs_raw, get_recvbufs_raw
     get_sendbufs_raw() = deepcopy(sendbufs_raw)
     get_recvbufs_raw() = deepcopy(recvbufs_raw)
-    get_cusendbufs_raw() = deepcopy(cusendbufs_raw)
-    get_curecvbufs_raw() = deepcopy(curecvbufs_raw)
-    get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw)
-    get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw)
 end