diff --git a/Project.toml b/Project.toml index 495398f..58289eb 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,7 @@ version = "0.13.0" [compat] AMDGPU = "0.5" -CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4" +CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5" LoopVectorization = "0.12" MPI = "0.20" julia = "1.9" @@ -18,7 +18,8 @@ MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [extras] CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" +MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["Test", "MPIPreferences"] diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index cc77591..de47075 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -12,7 +12,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de - {`periodx`|`periody`|`periodz`}`::Integer=0`: whether the grid is periodic (`1`) or not (`0`) in dimension {x|y|z}. - `quiet::Bool=false`: whether to suppress printing information like the size of the global grid (`true`) or not (`false`). !!! note "Advanced keyword arguments" - - {`overlapx`|`overlapy`|`overlapz`}`::Integer=2`: the number of elements adjacent local grids overlap in dimension {x|y|z}. By default (value `2`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlapx=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z. + - `overlaps::Tuple{Int,Int,Int}=(2,2,2)`: the number of elements adjacent local grids overlap in dimension x, y and z. By default (value `(2,2,2)`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlaps[1]=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z. + - `halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2)`: the default width of an array's halo in dimension x, y and z (must be greater than 1). The default can be overwritten per array in the function [`update_halo`](@ref). - `disp::Integer=1`: the displacement argument to `MPI.Cart_shift` in order to determine the neighbors. - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology. @@ -37,12 +38,13 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de See also: [`finalize_global_grid`](@ref), [`select_device`](@ref) """ -function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlapx::Integer=2, overlapy::Integer=2, overlapz::Integer=2, disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) +function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) if grid_is_initialized() error("The global grid has already been initialized.") end nxyz = [nx, ny, nz]; dims = [dimx, dimy, dimz]; periods = [periodx, periody, periodz]; - overlaps = [overlapx, overlapy, overlapz]; + overlaps = [overlaps...]; + halowidths = [halowidths...]; cuda_enabled = false amdgpu_enabled = false cudaaware_MPI = [false, false, false] @@ -70,10 +72,15 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end + if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end + if (any(periods .∉ ((0,1),))) error("Invalid arguments: periodx, periody, and periodz must be either 0 or 1."); end + if (any(halowidths .< 1)) error("Invalid arguments: halowidths cannot be less than 1."); end if (nx==1) error("Invalid arguments: nx can never be 1.") end if (ny==1 && nz>1) error("Invalid arguments: ny cannot be 1 if nz is greater than 1.") end if (any((nxyz .== 1) .& (dims .>1 ))) error("Incoherent arguments: if nx, ny, or nz is 1, then the corresponding dimx, dimy or dimz must not be set (or set 0 or 1)."); end - if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlapx-1, 2*overlapy-1 or 2*overlapz-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end + if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlaps[1]-1, 2*overlaps[2]-1 or 2*overlaps[3]-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end + if (any((overlaps .> 0) .& (halowidths .> overlaps.÷2))) error("Incoherent arguments: if overlap is greater than 0, then halowidth cannot be greater than overlap÷2, in each dimension."); end dims[(nxyz.==1).&(dims.==0)] .= 1; # Setting any of nxyz to 1, means that the corresponding dimension must also be 1 in the global grid. Thus, the corresponding dims entry must be 1. if (init_MPI) # NOTE: init MPI only, once the input arguments have been checked. if (MPI.Initialized()) error("MPI is already initialized. Set the argument 'init_MPI=false'."); end @@ -91,7 +98,7 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp); end nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2 - set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet)); + set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet)); if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]))"); end if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end init_timing_functions(); diff --git a/src/shared.jl b/src/shared.jl index 8455714..4a75457 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -38,9 +38,15 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU" ##------ ## TYPES -const GGInt = Cint -const GGNumber = Number -const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGInt = Cint +const GGNumber = Number +const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGField{T,N,T_array} = NamedTuple{(:A, :halowidths), Tuple{T_array, Tuple{GGInt,GGInt,GGInt}}} where {T_array<:GGArray{T,N}} +const GGFieldConvertible{T,N,T_array} = NamedTuple{(:A, :halowidths), <:Tuple{T_array, Tuple{T2,T2,T2}}} where {T_array<:GGArray{T,N}, T2<:Integer} +const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A),typeof(t.A)}((t.A, GGInt.(t.halowidths))) +const CPUField{T,N} = GGField{T,N,Array{T,N}} +const CuField{T,N} = GGField{T,N,CuArray{T,N}} +const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid @@ -48,6 +54,7 @@ struct GlobalGrid nxyz::Vector{GGInt} dims::Vector{GGInt} overlaps::Vector{GGInt} + halowidths::Vector{GGInt} nprocs::GGInt me::GGInt coords::Vector{GGInt} @@ -63,7 +70,7 @@ struct GlobalGrid loopvectorization::Vector{Bool} quiet::Bool end -const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, false, false, [false,false,false], [false,false,false], [false,false,false], false) +const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, false, false, [false,false,false], [false,false,false], [false,false,false], false) # Macro to switch on/off check_initialized() for performance reasons (potentially relevant for tools.jl). macro check_initialized() :(check_initialized();) end #FIXME: Alternative: macro check_initialized() end @@ -92,6 +99,8 @@ me() = global_grid().me comm() = global_grid().comm ol(dim::Integer) = global_grid().overlaps[dim] ol(dim::Integer, A::GGArray) = global_grid().overlaps[dim] + (size(A,dim) - global_grid().nxyz[dim]) +ol(A::GGArray) = (ol(dim, A) for dim in 1:ndims(A)) +hw_default() = global_grid().halowidths neighbors(dim::Integer) = global_grid().neighbors[:,dim] neighbor(n::Integer, dim::Integer) = global_grid().neighbors[n,dim] cuda_enabled() = global_grid().cuda_enabled @@ -103,14 +112,32 @@ amdgpuaware_MPI(dim::Integer) = global_grid().amdgpuaware_MPI[dim] loopvectorization() = global_grid().loopvectorization loopvectorization(dim::Integer) = global_grid().loopvectorization[dim] has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL -any_array(fields::GGArray...) = any([is_array(A) for A in fields]) -any_cuarray(fields::GGArray...) = any([is_cuarray(A) for A in fields]) -any_rocarray(fields::GGArray...) = any([is_rocarray(A) for A in fields]) +any_array(fields::GGField...) = any([is_array(A.A) for A in fields]) +any_cuarray(fields::GGField...) = any([is_cuarray(A.A) for A in fields]) +any_rocarray(fields::GGField...) = any([is_rocarray(A.A) for A in fields]) is_array(A::GGArray) = typeof(A) <: Array is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + +wrap_field(A::GGField) = A +wrap_field(A::GGFieldConvertible) = GGField(A) +wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) +wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) +wrap_field(A::GGArray) = wrap_field(A, hw_default()...) + +Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A) +Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...) +Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A) +Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A) +Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A) + + ##--------------- ## CUDA functions diff --git a/src/update_halo.jl b/src/update_halo.jl index ad25e7f..189fe9c 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -4,11 +4,15 @@ export update_halo! update_halo!(A) update_halo!(A...) +!!! note "Advanced" + update_halo!(A, B, (A=C, halowidths=..., (A=D, halowidths=...), ...) + Update the halo of the given GPU/CPU-array(s). # Typical use cases: - update_halo!(A) # Update the halo of the array A. - update_halo!(A, B, C) # Update the halos of the arrays A, B and C. + update_halo!(A) # Update the halo of the array A. + update_halo!(A, B, C) # Update the halos of the arrays A, B and C. + update_halo!(A, B, (A=C, halowidths=(2,2,2))) # Update the halos of the arrays A, B, C, defining non default halowidth for C. !!! note "Performance note" Group subsequent calls to `update_halo!` in a single call for better performance (enables additional pipelining). @@ -22,14 +26,15 @@ Update the halo of the given GPU/CPU-array(s). shell> export IGG_ROCMAWARE_MPI=1 ``` """ -function update_halo!(A::GGArray...) +function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...) check_initialized(); - check_fields(A...); - _update_halo!(A...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). + fields = wrap_field.(A); + check_fields(fields...); + _update_halo!(fields...); # Assignment of A to fields in the internal function _update_halo!() as vararg A can consist of multiple fields; A will be used for a single field in the following (The args of update_halo! must however be "A..." for maximal simplicity and elegance for the user). return nothing end -function _update_halo!(fields::GGArray...) +function _update_halo!(fields::GGField...) if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`. if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`. allocate_bufs(fields...); @@ -81,7 +86,7 @@ end ##--------------------------- ## FUNCTIONS FOR SYNTAX SUGAR -halosize(dim::Integer, A::GGArray) = (ndims(A)>1) ? size(A)[1:ndims(A).!=dim] : (1,); +halosize(dim::Integer, A::GGField) = (dim==1) ? (A.halowidths[1], size(A,2), size(A,3)) : ((dim==2) ? (size(A,1), A.halowidths[2], size(A,3)) : (size(A,1), size(A,2), A.halowidths[3])) ##--------------------------------------- @@ -147,7 +152,7 @@ let end # Allocate for each field two send and recv buffers (one for the left and one for the right neighbour of a dimension). The required length of the buffer is given by the maximal number of halo elements in any of the dimensions. Note that buffers are not allocated separately for each dimension, as the updates are performed one dimension at a time (required for correctness). - function allocate_bufs(fields::GGArray{T}...) where T <: GGNumber + function allocate_bufs(fields::GGField{T}...) where T <: GGNumber if (isnothing(sendbufs_raw) || isnothing(recvbufs_raw)) free_update_halo_buffers(); init_bufs_arrays(); @@ -158,13 +163,13 @@ let if cuda_enabled() init_cubufs(T, fields...); end if amdgpu_enabled() init_rocbufs(T, fields...); end for i = 1:length(fields) - A = fields[i]; + A, halowidths = fields[i]; for n = 1:NNEIGHBORS_PER_DIM # Ensure that the buffers are interpreted to contain elements of the same type as the array. reinterpret_bufs(T, i, n); if cuda_enabled() reinterpret_cubufs(T, i, n); end if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end end - max_halo_elems = (ndims(A) > 1) ? prod(sort([size(A)...])[2:end]) : 1; + max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); if (length(sendbufs_raw[i][1]) < max_halo_elems) for n = 1:NNEIGHBORS_PER_DIM reallocate_bufs(T, i, n, max_halo_elems); @@ -194,7 +199,7 @@ let recvbufs_raw = Array{Array{Any,1},1}(); end - function init_bufs(T::DataType, fields::GGArray...) + function init_bufs(T::DataType, fields::GGField...) while (length(sendbufs_raw) < length(fields)) push!(sendbufs_raw, [zeros(T,0), zeros(T,0)]); end while (length(recvbufs_raw) < length(fields)) push!(recvbufs_raw, [zeros(T,0), zeros(T,0)]); end end @@ -219,7 +224,7 @@ let curecvbufs_raw_h = Array{Array{Any,1},1}(); end - function init_cubufs(T::DataType, fields::GGArray...) + function init_cubufs(T::DataType, fields::GGField...) while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end @@ -252,7 +257,7 @@ let # INFO: no need for roc host buffers end - function init_rocbufs(T::DataType, fields::GGArray...) + function init_rocbufs(T::DataType, fields::GGField...) while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end # INFO: no need for roc host buffers @@ -277,41 +282,41 @@ let # (CPU functions) - function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGArray{T}) where T <: GGNumber + function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber return view(sendbufs_raw[i][n]::AbstractVector{T},1:prod(halosize(dim,A))); end - function recvbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGArray{T}) where T <: GGNumber + function recvbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber return view(recvbufs_raw[i][n]::AbstractVector{T},1:prod(halosize(dim,A))); end - function sendbuf(n::Integer, dim::Integer, i::Integer, A::GGArray) + function sendbuf(n::Integer, dim::Integer, i::Integer, A::GGField) return reshape(sendbuf_flat(n,dim,i,A), halosize(dim,A)); end - function recvbuf(n::Integer, dim::Integer, i::Integer, A::GGArray) + function recvbuf(n::Integer, dim::Integer, i::Integer, A::GGField) return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A)); end # (CUDA functions) - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuArray{T}) where T <: GGNumber + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); end - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuArray{T}) where T <: GGNumber + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); end # (AMDGPU functions) - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCArray{T}) where T <: GGNumber + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); end - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCArray{T}) where T <: GGNumber + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); end @@ -319,11 +324,11 @@ let # (GPU functions) #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. - function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuArray{T}, ROCArray{T}}) where T <: GGNumber + function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); end - function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuArray{T}, ROCArray{T}}) where T <: GGNumber + function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); end @@ -347,7 +352,7 @@ end # (CPU functions) -function allocate_tasks(fields::GGArray...) +function allocate_tasks(fields::GGField...) allocate_tasks_iwrite(fields...); allocate_tasks_iread(fields...); end @@ -357,18 +362,19 @@ let tasks = Array{Task}(undef, NNEIGHBORS_PER_DIM, 0); - wait_iwrite(n::Integer, A::Array{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) # The argument A is used for multiple dispatch. #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance as the tasks are mapped only to one thread via context switching). + wait_iwrite(n::Integer, A::CPUField{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) # The argument A is used for multiple dispatch. #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance as the tasks are mapped only to one thread via context switching). - function allocate_tasks_iwrite(fields::GGArray...) - if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an Array + function allocate_tasks_iwrite(fields::GGField...) + if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an CPUField tasks = [tasks Array{Task}(undef, NNEIGHBORS_PER_DIM, length(fields)-size(tasks,2))]; # Create (additional) emtpy tasks. - end + end end - function iwrite_sendbufs!(n::Integer, dim::Integer, A::Array{T}, i::Integer) where T <: GGNumber # Function to be called if A is a CPU array. + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CPUField{T}, i::Integer) where T <: GGNumber # Function to be called if A is a CPUField. + A, halowidths = F; tasks[n,i] = @task begin - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... - write_h2h!(sendbuf(n,dim,i,A), A, sendranges(n,dim,A), dim); + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + write_h2h!(sendbuf(n,dim,i,F), A, sendranges(n,dim,F), dim); end end end @@ -383,18 +389,19 @@ let tasks = Array{Task}(undef, NNEIGHBORS_PER_DIM, 0); - wait_iread(n::Integer, A::Array{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance currently as the tasks are mapped only to one thread via context switching). + wait_iread(n::Integer, A::CPUField{T}, i::Integer) where T <: GGNumber = (schedule(tasks[n,i]); wait(tasks[n,i]);) #NOTE: The current implementation only starts a task when it is waited for, in order to make sure that only one task is run at a time and that they are run in the desired order (best for performance currently as the tasks are mapped only to one thread via context switching). - function allocate_tasks_iread(fields::GGArray...) + function allocate_tasks_iread(fields::GGField...) if length(fields) > size(tasks,2) # Note: for simplicity, we create a tasks for every field even if it is not an Array tasks = [tasks Array{Task}(undef, NNEIGHBORS_PER_DIM, length(fields)-size(tasks,2))]; # Create (additional) emtpy tasks. end end - function iread_recvbufs!(n::Integer, dim::Integer, A::Array{T}, i::Integer) where T <: GGNumber + function iread_recvbufs!(n::Integer, dim::Integer, F::CPUField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; tasks[n,i] = @task begin - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... - read_h2h!(recvbuf(n,dim,i,A), A, recvranges(n,dim,A), dim); + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + read_h2h!(recvbuf(n,dim,i,F), A, recvranges(n,dim,F), dim); end end end @@ -407,7 +414,7 @@ end # (CUDA functions) -function allocate_custreams(fields::GGArray...) +function allocate_custreams(fields::GGField...) allocate_custreams_iwrite(fields...); allocate_custreams_iread(fields...); end @@ -417,24 +424,25 @@ let custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iwrite(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - function allocate_custreams_iwrite(fields::GGArray...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuArray + function allocate_custreams_iwrite(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iwrite_sendbufs!(n::Integer, dim::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, A); + ranges = sendranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); else - write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), custreams[n,i]); + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); end end end @@ -445,24 +453,25 @@ let custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iread(n::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - function allocate_custreams_iread(fields::GGArray...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuArray + function allocate_custreams_iread(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iread_recvbufs!(n::Integer, dim::Integer, A::CuArray{T}, i::Integer) where T <: GGNumber - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, A); + ranges = recvranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); else - read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), custreams[n,i]); + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); end end end @@ -471,7 +480,7 @@ end # (AMDGPU functions) -function allocate_rocstreams(fields::GGArray...) +function allocate_rocstreams(fields::GGField...) allocate_rocstreams_iwrite(fields...); allocate_rocstreams_iread(fields...); end @@ -481,26 +490,27 @@ let rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iwrite(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - function allocate_rocstreams_iwrite(fields::GGArray...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCArray + function allocate_rocstreams_iwrite(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iwrite_sendbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... # DEBUG: the follow section needs perf testing # DEBUG 2: commenting read_h2d_async! for now # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, A); + ranges = sendranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); # else - # write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]); + # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); # end end end @@ -511,26 +521,27 @@ let rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - wait_iread(n::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - function allocate_rocstreams_iread(fields::GGArray...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCArray + function allocate_rocstreams_iread(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. end end - function iread_recvbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... # DEBUG: the follow section needs perf testing # DEBUG 2: commenting read_h2d_async! for now # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, A); + ranges = recvranges(n, dim, F); nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); # else - # read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]); + # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); # end end end @@ -541,24 +552,26 @@ end # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). -function sendranges(n::Integer, dim::Integer, A::GGArray) - if (ol(dim, A) < 2) error("Incoherent arguments: ol(A,dim)<2."); end +function sendranges(n::Integer, dim::Integer, F::GGField) + A, halowidths = F; + if (ol(dim, A) < 2*halowidths[dim]) error("Incoherent arguments: ol(A,dim)<2*halowidths[dim]."); end if (n==2) ixyz_dim = size(A, dim) - (ol(dim, A) - 1); - elseif (n==1) ixyz_dim = 1 + (ol(dim, A) - 1); + elseif (n==1) ixyz_dim = 1 + (ol(dim, A) - halowidths[dim]); end sendranges = [1:size(A,1), 1:size(A,2), 1:size(A,3)]; # Initialize with the ranges of A. - sendranges[dim] = ixyz_dim:ixyz_dim; + sendranges[dim] = ixyz_dim:ixyz_dim+halowidths[dim]-1; return sendranges end # Return the ranges from A to be received. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). -function recvranges(n::Integer, dim::Integer, A::GGArray) - if (ol(dim, A) < 2) error("Incoherent arguments: ol(A,dim)<2."); end - if (n==2) ixyz_dim = size(A, dim); +function recvranges(n::Integer, dim::Integer, F::GGField) + A, halowidths = F; + if (ol(dim, A) < 2*halowidths[dim]) error("Incoherent arguments: ol(A,dim)<2*halowidths[dim]."); end + if (n==2) ixyz_dim = size(A, dim) - (halowidths[dim] - 1); elseif (n==1) ixyz_dim = 1; end recvranges = [1:size(A,1), 1:size(A,2), 1:size(A,3)]; # Initialize with the ranges of A. - recvranges[dim] = ixyz_dim:ixyz_dim; + recvranges[dim] = ixyz_dim:ixyz_dim+halowidths[dim]-1; return recvranges end @@ -604,10 +617,7 @@ function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrange iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - if (dim == 1) gpusendbuf[iy,iz] = A[ix,iy,iz]; - elseif (dim == 2) gpusendbuf[ix,iz] = A[ix,iy,iz]; - elseif (dim == 3) gpusendbuf[ix,iy] = A[ix,iy,iz]; - end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; return nothing end @@ -617,10 +627,7 @@ function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - if (dim == 1) A[ix,iy,iz] = gpurecvbuf[iy,iz]; - elseif (dim == 2) A[ix,iy,iz] = gpurecvbuf[ix,iz]; - elseif (dim == 3) A[ix,iy,iz] = gpurecvbuf[ix,iy]; - end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; return nothing end @@ -657,10 +664,7 @@ function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendran iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - if (dim == 1) gpusendbuf[iy,iz] = A[ix,iy,iz]; - elseif (dim == 2) gpusendbuf[ix,iz] = A[ix,iy,iz]; - elseif (dim == 3) gpusendbuf[ix,iy] = A[ix,iy,iz]; - end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; return nothing end @@ -670,10 +674,7 @@ function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrang iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - if (dim == 1) A[ix,iy,iz] = gpurecvbuf[iy,iz]; - elseif (dim == 2) A[ix,iy,iz] = gpurecvbuf[ix,iz]; - elseif (dim == 3) A[ix,iy,iz] = gpurecvbuf[ix,iy]; - end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; return nothing end @@ -710,43 +711,46 @@ end ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS -function irecv_halo!(n::Integer, dim::Integer, A::GGArray, i::Integer; tag::Integer=0) +function irecv_halo!(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integer=0) req = MPI.REQUEST_NULL; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) - req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); else - req = MPI.Irecv!(recvbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Irecv!(recvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); end end return req end -function isend_halo(n::Integer, dim::Integer, A::GGArray, i::Integer; tag::Integer=0) +function isend_halo(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integer=0) req = MPI.REQUEST_NULL; - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) - req = MPI.Isend(gpusendbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Isend(gpusendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); else - req = MPI.Isend(sendbuf_flat(n,dim,i,A), neighbor(n,dim), tag, comm()); + req = MPI.Isend(sendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); end end return req end -function sendrecv_halo_local(n::Integer, dim::Integer, A::GGArray, i::Integer) - if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2... +function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer) + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) if n == 1 - gpumemcopy!(gpurecvbuf_flat(2,dim,i,A), gpusendbuf_flat(1,dim,i,A)); + gpumemcopy!(gpurecvbuf_flat(2,dim,i,F), gpusendbuf_flat(1,dim,i,F)); elseif n == 2 - gpumemcopy!(gpurecvbuf_flat(1,dim,i,A), gpusendbuf_flat(2,dim,i,A)); + gpumemcopy!(gpurecvbuf_flat(1,dim,i,F), gpusendbuf_flat(2,dim,i,F)); end else if n == 1 - memcopy!(recvbuf_flat(2,dim,i,A), sendbuf_flat(1,dim,i,A), loopvectorization(dim)); + memcopy!(recvbuf_flat(2,dim,i,F), sendbuf_flat(1,dim,i,F), loopvectorization(dim)); elseif n == 2 - memcopy!(recvbuf_flat(1,dim,i,A), sendbuf_flat(2,dim,i,A), loopvectorization(dim)); + memcopy!(recvbuf_flat(1,dim,i,F), sendbuf_flat(2,dim,i,F), loopvectorization(dim)); end end end @@ -801,12 +805,21 @@ end ##------------------------------------------- ## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS -function check_fields(fields::GGArray...) - # Raise an error if any of the given fields does not have a halo. +# NOTE: no comparison must be done between the field-local halowidths and field-local overlaps because any combination is valid: the rational is that a field has simply no halo but only computation overlap in a given dimension if the corresponding local overlap is less than 2 times the local halowidth. This allows to determine whether a halo update needs to be done in a certain dimension or not. +function check_fields(fields::GGField...) + # Raise an error if any of the given fields has a halowidth less than 1. + invalid_halowidths = [i for i=1:length(fields) if any([fields[i].halowidths[dim]<1 for dim=1:ndims(fields[i])])]; + if length(invalid_halowidths) > 1 + error("The fields at positions $(join(invalid_halowidths,", "," and ")) have a halowidth less than 1.") + elseif length(invalid_halowidths) > 0 + error("The field at position $(invalid_halowidths[1]) has a halowidth less than 1.") + end + + # Raise an error if any of the given fields has no halo at all (in any dimension) - in this case there is no halo update to do and including the field in the call is inconsistent. no_halo = Int[]; for i = 1:length(fields) - A = fields[i]; - if all([ol(dim, A) < 2 for dim = 1:ndims(A)]) # There is no halo if the overlap is less than 2... + A, halowidths = fields[i] + if all([(ol(dim, A) < 2*halowidths[dim]) for dim = 1:ndims(A)]) # There is no halo if the overlap is less than 2 times the halowidth (only computation overlap in this case)... push!(no_halo, i); end end @@ -817,7 +830,7 @@ function check_fields(fields::GGArray...) end # Raise an error if any of the given fields contains any duplicates. - duplicates = [[i,j] for i=1:length(fields) for j=i+1:length(fields) if fields[i]===fields[j]]; + duplicates = [[i,j] for i=1:length(fields) for j=i+1:length(fields) if fields[i].A===fields[j].A]; if length(duplicates) > 2 error("The pairs of fields with the positions $(join(duplicates,", "," and ")) are the same; remove any duplicates from the call.") elseif length(duplicates) > 0 @@ -825,7 +838,7 @@ function check_fields(fields::GGArray...) end # Raise an error if not all fields are of the same datatype (restriction comes from buffer handling). - different_types = [i for i=2:length(fields) if typeof(fields[i])!=typeof(fields[1])]; + different_types = [i for i=2:length(fields) if typeof(fields[i].A)!=typeof(fields[1].A)]; if length(different_types) > 1 error("The fields at positions $(join(different_types,", "," and ")) are of different type than the first field; make sure that in a same call all fields are of the same type.") elseif length(different_types) == 1 diff --git a/test/test_gather.jl b/test/test_gather.jl index 2a68b46..42cc4af 100644 --- a/test/test_gather.jl +++ b/test/test_gather.jl @@ -35,7 +35,7 @@ dz = 1.0 @testset "2. gather!" begin @testset "1D" begin - me, dims = init_global_grid(nx, 1, 1, overlapx=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, 1, 1, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx); P_g = zeros(nx*dims[1]); P .= [x_g(ix,dx,P) for ix=1:size(P,1)]; @@ -46,7 +46,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "2D" begin - me, dims = init_global_grid(nx, ny, 1, overlapx=0, overlapy=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, 1, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx, ny); P_g = zeros(nx*dims[1], ny*dims[2]); P .= [y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2)]; @@ -57,7 +57,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "3D" begin - me, dims = init_global_grid(nx, ny, nz, overlapx=0, overlapy=0, overlapz=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, nz, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx, ny, nz); P_g = zeros(nx*dims[1], ny*dims[2], nz*dims[3]); P .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; @@ -68,7 +68,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "1D, then larger 3D, then smaller 2D" begin - me, dims = init_global_grid(nx, ny, nz, overlapx=0, overlapy=0, overlapz=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, nz, overlaps=(0,0,0), quiet=true, init_MPI=false); # (1D) P = zeros(nx); P_g = zeros(nx*dims[1], dims[2], dims[3]); @@ -96,7 +96,7 @@ dz = 1.0 finalize_global_grid(finalize_MPI=false); end; @testset "Float32, then Float64, then Int16" begin - me, dims = init_global_grid(nx, ny, nz, overlapx=0, overlapy=0, overlapz=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, ny, nz, overlaps=(0,0,0), quiet=true, init_MPI=false); # Float32 (1D) P = zeros(Float32, nx); P_g = zeros(Float32, nx*dims[1], dims[2], dims[3]); @@ -136,7 +136,7 @@ dz = 1.0 end; end @testset "nothing on non-root" begin - me, dims = init_global_grid(nx, 1, 1, overlapx=0, quiet=true, init_MPI=false); + me, dims = init_global_grid(nx, 1, 1, overlaps=(0,0,0), quiet=true, init_MPI=false); P = zeros(nx); P_g = (me == 0) ? zeros(nx*dims[1]) : nothing P .= [x_g(ix,dx,P) for ix=1:size(P,1)]; diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl index f24343e..9076a94 100644 --- a/test/test_init_global_grid.jl +++ b/test/test_init_global_grid.jl @@ -36,6 +36,7 @@ nz = 1; @test GG.global_grid().nxyz == [nx, ny, nz] @test GG.global_grid().dims == dims @test GG.global_grid().overlaps == [2, 2, 2] + @test GG.global_grid().halowidths== [1, 1, 1] @test GG.global_grid().nprocs == nprocs @test GG.global_grid().me == me @test GG.global_grid().coords == coords @@ -73,16 +74,19 @@ nz = 1; end; @testset "5. initialization with non-default overlaps and one periodic boundary" begin - nz = 8; - olz = 3; + nz = 10; olx = 3; - init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlapx=olx, overlapz=olz, quiet=true, init_MPI=false); + oly = 0; + olz = 4; + init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlaps=(olx, oly, olz), quiet=true, init_MPI=false); @testset "initialized" begin @test GG.grid_is_initialized() end @testset "values in global grid" begin # (Checks only what is different than in the basic test.) @test GG.global_grid().nxyz_g == [nx, ny, nz-olz] # Note: olx has no effect as there is only 1 process and this boundary is not periodic. @test GG.global_grid().nxyz == [nx, ny, nz ] + @test GG.global_grid().overlaps == [olx, oly, olz] + @test GG.global_grid().halowidths== [1, 1, 2] @test GG.global_grid().neighbors == [p0 p0 0; p0 p0 0] @test GG.global_grid().periods == [0, 0, 1] end; @@ -99,7 +103,9 @@ nz = 1; @test_throws ErrorException init_global_grid(nx, 1, nz, quiet=true, init_MPI=false); # Error: ny==1, while nz>1. @test_throws ErrorException init_global_grid(nx, ny, 1, dimz=3, quiet=true, init_MPI=false); # Error: dimz>1 while nz==1. @test_throws ErrorException init_global_grid(nx, ny, 1, periodz=1, quiet=true, init_MPI=false); # Error: periodz==1 while nz==1. - @test_throws ErrorException init_global_grid(nx, ny, nz, periody=1, overlapy=3, quiet=true, init_MPI=false); # Error: periody==1 while ny<2*overlapy-1 (4<5). + @test_throws ErrorException init_global_grid(nx, ny, nz, periody=1, overlaps=(2,3,2), quiet=true, init_MPI=false); # Error: periody==1 while ny<2*overlaps[2]-1 (4<5). + @test_throws ErrorException init_global_grid(nx, ny, nz, halowidths=(1,0,1), quiet=true, init_MPI=false); # Error: halowidths[2]<1. + @test_throws ErrorException init_global_grid(nx, ny, nz, overlaps=(4,3,2), halowidths=(2,2,1), quiet=true, init_MPI=false); # Error: halowidths[2]==2 while overlaps[2]==3. @test_throws ErrorException init_global_grid(nx, ny, nz, quiet=true); # Error: MPI already initialized @testset "already initialized exception" begin init_global_grid(nx, ny, nz, quiet=true, init_MPI=false); diff --git a/test/test_tools.jl b/test/test_tools.jl index 30083f3..fdcf432 100644 --- a/test/test_tools.jl +++ b/test/test_tools.jl @@ -77,7 +77,7 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); Vz = zeros(nx, ny, nz+1); A = zeros(nx, ny, nz+2); Sxz = zeros(nx-2,ny-1,nz-2); - init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false); + init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodz=1, overlaps=(3,2,3), quiet=true, init_MPI=false); @testset "nx_g / ny_g / nz_g" begin @test nx_g() == nx @test ny_g() == ny diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index a737bc1..3a742fd 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -66,14 +66,17 @@ dz = 1.0 A2 = A; Z = zeros(ComplexF64, nx-1,ny+2,nz+1); Z2 = Z; - @test_throws ErrorException update_halo!(P, Sxz, A) # Error: Sxz has no halo. - @test_throws ErrorException update_halo!(P, Sxz, A, Sxz) # Error: Sxz and Sxz have no halo. - @test_throws ErrorException update_halo!(P, A, A) # Error: A is given twice. - @test_throws ErrorException update_halo!(P, A, A2) # Error: A2 is duplicate of A (an alias; it points to the same memory). - @test_throws ErrorException update_halo!(P, A, A, A2) # Error: the second A and A2 are duplicates of the first A. - @test_throws ErrorException update_halo!(Z, Z2) # Error: Z2 is duplicate of Z (an alias; it points to the same memory). - @test_throws ErrorException update_halo!(Z, P) # Error: P is of different type than Z. - @test_throws ErrorException update_halo!(Z, P, A) # Error: P and A are of different type than Z. + @test_throws ErrorException update_halo!(P, Sxz, A) # Error: Sxz has no halo. + @test_throws ErrorException update_halo!(P, Sxz, A, Sxz) # Error: Sxz and Sxz have no halo. + @test_throws ErrorException update_halo!(A, (A=P, halowidths=(1,0,1))) # Error: P has an invalid halowidth (less than 1). + @test_throws ErrorException update_halo!(A, (A=P, halowidths=(2,2,2))) # Error: P has no halo. + @test_throws ErrorException update_halo!((A=A, halowidths=(0,3,2)), (A=P, halowidths=(2,2,2))) # Error: A and P have no halo. + @test_throws ErrorException update_halo!(P, A, A) # Error: A is given twice. + @test_throws ErrorException update_halo!(P, A, A2) # Error: A2 is duplicate of A (an alias; it points to the same memory). + @test_throws ErrorException update_halo!(P, A, A, A2) # Error: the second A and A2 are duplicates of the first A. + @test_throws ErrorException update_halo!(Z, Z2) # Error: Z2 is duplicate of Z (an alias; it points to the same memory). + @test_throws ErrorException update_halo!(Z, P) # Error: P is of different type than Z. + @test_throws ErrorException update_halo!(Z, P, A) # Error: P and A are of different type than Z. finalize_global_grid(finalize_MPI=false); end; @@ -85,6 +88,9 @@ dz = 1.0 C = zeros(Float32, nx+1, ny+1, nz+1); Z = zeros(ComplexF16, nx, ny, nz ); Y = zeros(ComplexF16, nx-1, ny+2, nz+1); + P, A, B, C, Z, Y = GG.wrap_field.((P, A, B, C, Z, Y)); + halowidths = (3,1,2); + A_hw, Z_hw = GG.wrap_field(A.A, halowidths), GG.wrap_field(Z.A, halowidths); @testset "free buffers" begin @require GG.get_sendbufs_raw() === nothing @require GG.get_recvbufs_raw() === nothing @@ -117,6 +123,18 @@ dz = 1.0 end end end; + @testset "allocate single (halowidth > 1)" begin + GG.free_update_halo_buffers(); + GG.allocate_bufs(A_hw); + max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); + for bufs_raw in [GG.get_sendbufs_raw(), GG.get_recvbufs_raw()] + @test length(bufs_raw) == 1 # 1 array + @test length(bufs_raw[1]) == nneighbors_per_dim # 2 neighbors per dimension + for n = 1:nneighbors_per_dim + @test length(bufs_raw[1][n]) >= max_halo_elems # required length: max halo elements in any of the dimensions + end + end + end; @testset "keep 1st, allocate 2nd" begin GG.free_update_halo_buffers(); GG.allocate_bufs(P); @@ -183,12 +201,16 @@ dz = 1.0 GG.free_update_halo_buffers(); GG.allocate_bufs(A, P); for dim = 1:ndims(A), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,1,A)) .== size(A)[1:ndims(A).!=dim]) - @test all(size(recvbuf(n,dim,1,A)) .== size(A)[1:ndims(A).!=dim]) + @test all(length(sendbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) + @test all(length(recvbuf(n,dim,1,A)) .== prod(size(A)[1:ndims(A).!=dim])) + @test all(size(sendbuf(n,dim,1,A))[dim] .== A.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,A))[dim] .== A.halowidths[dim]) end for dim = 1:ndims(P), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,2,P)) .== size(P)[1:ndims(P).!=dim]) - @test all(size(recvbuf(n,dim,2,P)) .== size(P)[1:ndims(P).!=dim]) + @test all(length(sendbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) + @test all(length(recvbuf(n,dim,2,P)) .== prod(size(P)[1:ndims(P).!=dim])) + @test all(size(sendbuf(n,dim,2,P))[dim] .== P.halowidths[dim]) + @test all(size(recvbuf(n,dim,2,P))[dim] .== P.halowidths[dim]) end end; @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (Complex)" begin @@ -199,12 +221,44 @@ dz = 1.0 GG.free_update_halo_buffers(); GG.allocate_bufs(Y, Z); for dim = 1:ndims(Y), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim]) - @test all(size(recvbuf(n,dim,1,Y)) .== size(Y)[1:ndims(Y).!=dim]) + @test all(length(sendbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) + @test all(length(recvbuf(n,dim,1,Y)) .== prod(size(Y)[1:ndims(Y).!=dim])) + @test all(size(sendbuf(n,dim,1,Y))[dim] .== Y.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,Y))[dim] .== Y.halowidths[dim]) end for dim = 1:ndims(Z), n = 1:nneighbors_per_dim - @test all(size(sendbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim]) - @test all(size(recvbuf(n,dim,2,Z)) .== size(Z)[1:ndims(Z).!=dim]) + @test all(length(sendbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) + @test all(length(recvbuf(n,dim,2,Z)) .== prod(size(Z)[1:ndims(Z).!=dim])) + @test all(size(sendbuf(n,dim,2,Z))[dim] .== Z.halowidths[dim]) + @test all(size(recvbuf(n,dim,2,Z))[dim] .== Z.halowidths[dim]) + end + end; + @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (halowidth > 1)" begin + sendbuf, recvbuf = (GG.sendbuf, GG.recvbuf); + if array_type in ["CUDA", "AMDGPU"] + sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); + end + GG.free_update_halo_buffers(); + GG.allocate_bufs(A_hw); + for dim = 1:ndims(A_hw), n = 1:nneighbors_per_dim + @test all(length(sendbuf(n,dim,1,A_hw)) .== prod(size(A_hw)[1:ndims(A_hw).!=dim])*A_hw.halowidths[dim]) + @test all(length(recvbuf(n,dim,1,A_hw)) .== prod(size(A_hw)[1:ndims(A_hw).!=dim])*A_hw.halowidths[dim]) + @test all(size(sendbuf(n,dim,1,A_hw))[dim] .== A_hw.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,A_hw))[dim] .== A_hw.halowidths[dim]) + end + end; + @testset "(cu/roc)sendbuf / (cu/roc)recvbuf (halowidth > 1, Complex)" begin + sendbuf, recvbuf = (GG.sendbuf, GG.recvbuf); + if array_type in ["CUDA", "AMDGPU"] + sendbuf, recvbuf = (GG.gpusendbuf, GG.gpurecvbuf); + end + GG.free_update_halo_buffers(); + GG.allocate_bufs(Z_hw); + for dim = 1:ndims(Z_hw), n = 1:nneighbors_per_dim + @test all(length(sendbuf(n,dim,1,Z_hw)) .== prod(size(Z_hw)[1:ndims(Z_hw).!=dim])*Z_hw.halowidths[dim]) + @test all(length(recvbuf(n,dim,1,Z_hw)) .== prod(size(Z_hw)[1:ndims(Z_hw).!=dim])*Z_hw.halowidths[dim]) + @test all(size(sendbuf(n,dim,1,Z_hw))[dim] .== Z_hw.halowidths[dim]) + @test all(size(recvbuf(n,dim,1,Z_hw))[dim] .== Z_hw.halowidths[dim]) end end; finalize_global_grid(finalize_MPI=false); @@ -213,9 +267,10 @@ dz = 1.0 @testset "3. data transfer components" begin @testset "iwrite_sendbufs! / iread_recvbufs!" begin @testset "sendranges / recvranges ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(2,2,3), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); + P, A = GG.wrap_field.((P, A)); @test GG.sendranges(1, 1, P) == [ 2:2, 1:size(P,2), 1:size(P,3)] @test GG.sendranges(2, 1, P) == [size(P,1)-1:size(P,1)-1, 1:size(P,2), 1:size(P,3)] @test GG.sendranges(1, 2, P) == [ 1:size(P,1), 2:2, 1:size(P,3)] @@ -242,27 +297,62 @@ dz = 1.0 @test GG.recvranges(2, 3, A) == [ 1:size(A,1), 1:size(A,2), size(A,3):size(A,3)] finalize_global_grid(finalize_MPI=false); end; + @testset "sendranges / recvranges (halowidth > 1, $array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) + nx = 13; + ny = 9; + nz = 9; + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(6,4,4), halowidths=(3,1,2), quiet=true, init_MPI=false, device_type=device_type); + P = zeros(nx, ny, nz ); + A = zeros(nx-1,ny+2,nz+1); + P, A = GG.wrap_field.((P, A)); + @test GG.sendranges(1, 1, P) == [ 4:6, 1:size(P,2), 1:size(P,3)] + @test GG.sendranges(2, 1, P) == [size(P,1)-5:size(P,1)-3, 1:size(P,2), 1:size(P,3)] + @test GG.sendranges(1, 2, P) == [ 1:size(P,1), 4:4, 1:size(P,3)] + @test GG.sendranges(2, 2, P) == [ 1:size(P,1), size(P,2)-3:size(P,2)-3, 1:size(P,3)] + @test GG.sendranges(1, 3, P) == [ 1:size(P,1), 1:size(P,2), 3:4] + @test GG.sendranges(2, 3, P) == [ 1:size(P,1), 1:size(P,2), size(P,3)-3:size(P,3)-2] + @test GG.recvranges(1, 1, P) == [ 1:3, 1:size(P,2), 1:size(P,3)] + @test GG.recvranges(2, 1, P) == [ size(P,1)-2:size(P,1), 1:size(P,2), 1:size(P,3)] + @test GG.recvranges(1, 2, P) == [ 1:size(P,1), 1:1, 1:size(P,3)] + @test GG.recvranges(2, 2, P) == [ 1:size(P,1), size(P,2):size(P,2), 1:size(P,3)] + @test GG.recvranges(1, 3, P) == [ 1:size(P,1), 1:size(P,2), 1:2] + @test GG.recvranges(2, 3, P) == [ 1:size(P,1), 1:size(P,2), size(P,3)-1:size(P,3)] + @test_throws ErrorException GG.sendranges(1, 1, A) + @test_throws ErrorException GG.sendranges(2, 1, A) + @test GG.sendranges(1, 2, A) == [ 1:size(A,1), 6:6, 1:size(A,3)] + @test GG.sendranges(2, 2, A) == [ 1:size(A,1), size(A,2)-5:size(A,2)-5, 1:size(A,3)] + @test GG.sendranges(1, 3, A) == [ 1:size(A,1), 1:size(A,2), 4:5] + @test GG.sendranges(2, 3, A) == [ 1:size(A,1), 1:size(A,2), size(A,3)-4:size(A,3)-3] + @test_throws ErrorException GG.recvranges(1, 1, A) + @test_throws ErrorException GG.recvranges(2, 1, A) + @test GG.recvranges(1, 2, A) == [ 1:size(A,1), 1:1, 1:size(A,3)] + @test GG.recvranges(2, 2, A) == [ 1:size(A,1), size(A,2):size(A,2), 1:size(A,3)] + @test GG.recvranges(1, 3, A) == [ 1:size(A,1), 1:size(A,2), 1:2] + @test GG.recvranges(2, 3, A) == [ 1:size(A,1), 1:size(A,2), size(A,3)-1:size(A,3)] + finalize_global_grid(finalize_MPI=false); + end; @testset "write_h2h! / read_h2h!" begin init_global_grid(nx, ny, nz; quiet=true, init_MPI=false); P = zeros(nx, ny, nz ); P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P2 = zeros(size(P)); + halowidths = (1,1,1) # (dim=1) - buf = zeros(size(P,2), size(P,3)); + buf = zeros(halowidths[1], size(P,2), size(P,3)); ranges = [2:2, 1:size(P,2), 1:size(P,3)]; GG.write_h2h!(buf, P, ranges, 1); @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) GG.read_h2h!(buf, P2, ranges, 1); @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) # (dim=2) - buf = zeros(size(P,1), size(P,3)); + buf = zeros(size(P,1), halowidths[2], size(P,3)); ranges = [1:size(P,1), 3:3, 1:size(P,3)]; GG.write_h2h!(buf, P, ranges, 2); @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) GG.read_h2h!(buf, P2, ranges, 2); @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) # (dim=3) - buf = zeros(size(P,1), size(P,2)); + buf = zeros(size(P,1), size(P,2), halowidths[3]); ranges = [1:size(P,1), 1:size(P,2), 4:4]; GG.write_h2h!(buf, P, ranges, 3); @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) @@ -270,17 +360,47 @@ dz = 1.0 @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) finalize_global_grid(finalize_MPI=false); end; + @testset "write_h2h! / read_h2h! (halowidth > 1)" begin + init_global_grid(nx, ny, nz; quiet=true, init_MPI=false); + P = zeros(nx, ny, nz ); + P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; + P2 = zeros(size(P)); + halowidths = (3,1,2); + # (dim=1) + buf = zeros(halowidths[1], size(P,2), size(P,3)); + ranges = [4:6, 1:size(P,2), 1:size(P,3)]; + GG.write_h2h!(buf, P, ranges, 1); + @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) + GG.read_h2h!(buf, P2, ranges, 1); + @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) + # (dim=2) + buf = zeros(size(P,1), halowidths[2], size(P,3)); + ranges = [1:size(P,1), 4:4, 1:size(P,3)]; + GG.write_h2h!(buf, P, ranges, 2); + @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) + GG.read_h2h!(buf, P2, ranges, 2); + @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) + # (dim=3) + buf = zeros(size(P,1), size(P,2), halowidths[3]); + ranges = [1:size(P,1), 1:size(P,2), 3:4]; + GG.write_h2h!(buf, P, ranges, 3); + @test all(buf[:] .== P[ranges[1],ranges[2],ranges[3]][:]) + GG.read_h2h!(buf, P2, ranges, 3); + @test all(buf[:] .== P2[ranges[1],ranges[2],ranges[3]][:]) + finalize_global_grid(finalize_MPI=false); + end; @static if test_cuda || test_amdgpu @testset "write_d2x! / write_d2h_async! / read_x2d! / read_h2d_async! ($array_type arrays)" for (array_type, device_type, gpuzeros, GPUArray) in zip(gpu_array_types, gpu_device_types, gpu_allocators, GPUArrayConstructors) init_global_grid(nx, ny, nz; quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P = GPUArray(P); + halowidths = (1,3,1) if array_type == "CUDA" # (dim=1) dim = 1; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,2), size(P,3)); + buf = zeros(halowidths[dim], size(P,2), size(P,3)); buf_d, buf_h = GG.register(CuArray,buf); ranges = [2:2, 1:size(P,2), 1:size(P,3)]; nthreads = (1, 1, 1); @@ -301,9 +421,9 @@ dz = 1.0 # (dim=2) dim = 2; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,3)); + buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d, buf_h = GG.register(CuArray,buf); - ranges = [1:size(P,1), 3:3, 1:size(P,3)]; + ranges = [1:size(P,1), 2:4, 1:size(P,3)]; nthreads = (1, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); @@ -322,7 +442,7 @@ dz = 1.0 # (dim=3) dim = 3 P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,2)); + buf = zeros(size(P,1), size(P,2), halowidths[dim]); buf_d, buf_h = GG.register(CuArray,buf); ranges = [1:size(P,1), 1:size(P,2), 4:4]; nthreads = (1, 1, 1); @@ -344,7 +464,7 @@ dz = 1.0 # (dim=1) dim = 1; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,2), size(P,3)); + buf = zeros(halowidths[dim], size(P,2), size(P,3)); buf_d = GG.register(ROCArray,buf); ranges = [2:2, 1:size(P,2), 1:size(P,3)]; nthreads = (1, 1, 1); @@ -365,9 +485,9 @@ dz = 1.0 # (dim=2) dim = 2; P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,3)); + buf = zeros(size(P,1), halowidths[dim], size(P,3)); buf_d = GG.register(ROCArray,buf); - ranges = [1:size(P,1), 3:3, 1:size(P,3)]; + ranges = [1:size(P,1), 2:4, 1:size(P,3)]; nthreads = (1, 1, 1); halosize = [r[end] - r[1] + 1 for r in ranges]; nblocks = Tuple(ceil.(Int, halosize./nthreads)); @@ -386,7 +506,7 @@ dz = 1.0 # (dim=3) dim = 3 P2 = gpuzeros(eltype(P),size(P)); - buf = zeros(size(P,1), size(P,2)); + buf = zeros(size(P,1), size(P,2), halowidths[dim]); buf_d = GG.register(ROCArray,buf); ranges = [1:size(P,1), 1:size(P,2), 4:4]; nthreads = (1, 1, 1); @@ -394,7 +514,7 @@ dz = 1.0 nblocks = Tuple(ceil.(Int, halosize./nthreads)); @roc gridsize=nblocks groupsize=nthreads GG.write_d2x!(buf_d, P, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize(); @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:])) - @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize(); + @roc gridsize=nblocks groupsize=nthreads GG.read_x2d!(buf_d, P2, ranges[1], ranges[2], ranges[3], dim); AMDGPU.synchronize(); @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:])) # buf .= 0.0; # DEBUG: diabling read_x2x_async! tests for now in AMDGPU backend because there is an issue most likely in HIP # P2 .= 0.0; @@ -409,11 +529,12 @@ dz = 1.0 end; end @testset "iwrite_sendbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); P .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]); A .= Array([iz*1e2 + iy*1e1 + ix for ix=1:size(A,1), iy=1:size(A,2), iz=1:size(A,3)]); + P, A = GG.wrap_field.((P, A)); GG.allocate_bufs(P, A); if (array_type == "CUDA") GG.allocate_custreams(P, A); elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A); @@ -426,10 +547,10 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[2,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[3:4,:,:][:]))) # DEBUG: here and later, CPUArray is needed to avoid error in AMDGPU because of mapreduce @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[2,:,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[3:4,:,:][:])) @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0) end n = 2 @@ -438,10 +559,10 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[end-1,:,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[end-3:end-2,:,:][:]))) @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== 0.0)) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[end-1,:,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-3:end-2,:,:][:])) @test all(GG.sendbuf_flat(n,dim,2,A) .== 0.0) end dim = 2 @@ -451,11 +572,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,2,:][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,4,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,2,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,4,:][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,2,:][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,4,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,2,:][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,4,:][:])) end n = 2 GG.iwrite_sendbufs!(n, dim, P, 1); @@ -463,11 +584,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,end-1,:][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,end-3,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,end-1,:][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,end-3,:][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,end-1,:][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,end-3,:][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,end-1,:][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,end-3,:][:])) end dim = 3 n = 1 @@ -476,11 +597,11 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,3][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,4][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,:,3][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,:,4][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,3][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,4][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,3][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,4][:])) end n = 2 GG.iwrite_sendbufs!(n, dim, P, 1); @@ -488,18 +609,19 @@ dz = 1.0 GG.wait_iwrite(n, P, 1); GG.wait_iwrite(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P[:,:,end-2][:]))) - @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A[:,:,end-3][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,1,P) .== Array(P.A[:,:,end-2][:]))) + @test all(CPUArray(GG.gpusendbuf_flat(n,dim,2,A) .== Array(A.A[:,:,end-3][:]))) else - @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end-2][:])) - @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end-3][:])) + @test all(GG.sendbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,end-2][:])) + @test all(GG.sendbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,end-3][:])) end finalize_global_grid(finalize_MPI=false); end; @testset "iread_recvbufs! ($array_type arrays)" for (array_type, device_type, zeros, Array) in zip(array_types, device_types, allocators, ArrayConstructors) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); + P, A = GG.wrap_field.((P, A)); GG.allocate_bufs(P, A); if (array_type == "CUDA") GG.allocate_custreams(P, A); elseif (array_type == "AMDGPU") GG.allocate_rocstreams(P, A); @@ -521,11 +643,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[1,:,:][:]))) - @test all(CPUArray( 0.0 .== Array(A[1,:,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[1:2,:,:][:]))) + @test all(CPUArray( 0.0 .== Array(A.A[1:2,:,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[1,:,:][:])) - @test all( 0.0 .== CPUArray(A[1,:,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[1:2,:,:][:])) + @test all( 0.0 .== CPUArray(A.A[1:2,:,:][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -533,11 +655,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[end,:,:][:]))) - @test all(CPUArray( 0.0 .== Array(A[end,:,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[end-1:end,:,:][:]))) + @test all(CPUArray( 0.0 .== Array(A.A[end-1:end,:,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[end,:,:][:])) - @test all( 0.0 .== CPUArray(A[end,:,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[end-1:end,:,:][:])) + @test all( 0.0 .== CPUArray(A.A[end-1:end,:,:][:])) end dim = 2 for n = 1:nneighbors_per_dim @@ -555,11 +677,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,1,:][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,1,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,1,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,1,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,1,:][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,1,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,1,:][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,1,:][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -567,11 +689,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,end,:][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,end,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,end,:][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,end,:][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,end,:][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,end,:][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,end,:][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,end,:][:])) end dim = 3 for n = 1:nneighbors_per_dim @@ -589,11 +711,11 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,1][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,1][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,:,1][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,:,1][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,1][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,1][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,1][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,1][:])) end n = 2 GG.iread_recvbufs!(n, dim, P, 1); @@ -601,19 +723,20 @@ dz = 1.0 GG.wait_iread(n, P, 1); GG.wait_iread(n, A, 2); if (array_type=="CUDA" && GG.cudaaware_MPI(dim)) || (array_type=="AMDGPU" && GG.amdgpuaware_MPI(dim)) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P[:,:,end][:]))) - @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A[:,:,end][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,1,P) .== Array(P.A[:,:,end][:]))) + @test all(CPUArray(GG.gpurecvbuf_flat(n,dim,2,A) .== Array(A.A[:,:,end][:]))) else - @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P[:,:,end][:])) - @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A[:,:,end][:])) + @test all(GG.recvbuf_flat(n,dim,1,P) .== CPUArray(P.A[:,:,end][:])) + @test all(GG.recvbuf_flat(n,dim,2,A) .== CPUArray(A.A[:,:,end][:])) end finalize_global_grid(finalize_MPI=false); end; if (nprocs==1) @testset "sendrecv_halo_local ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz ); A = zeros(nx-1,ny+2,nz+1); + P, A = GG.wrap_field.((P, A)); GG.allocate_bufs(P, A); dim = 1 for n = 1:nneighbors_per_dim @@ -696,9 +819,10 @@ dz = 1.0 end; if (nprocs>1) @testset "irecv_halo! / isend_halo ($array_type arrays)" for (array_type, device_type, zeros) in zip(array_types, device_types, allocators) - me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, quiet=true, init_MPI=false, device_type=device_type); + me, dims, nprocs, coords, comm = init_global_grid(nx, ny, nz; dimy=1, dimz=1, periodx=1, overlaps=(4,4,4), halowidths=(2,1,2), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx,ny,nz); A = zeros(nx,ny,nz); + P, A = GG.wrap_field.((P, A)); dim = 1; GG.allocate_bufs(P, A); for n = 1:nneighbors_per_dim @@ -788,14 +912,14 @@ dz = 1.0 @test all(CPUArray(P .== P_ref)) finalize_global_grid(finalize_MPI=false); end; - @testset "3D (non-default overlap)" begin - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=4, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + @testset "3D (non-default overlap and halowidth)" begin + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); P = zeros(nx, ny, nz); P .= [z_g(iz,dz,P)*1e2 + y_g(iy,dy,P)*1e1 + x_g(ix,dx,P) for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]; P_ref = copy(P); - P[[1, end], :, :] .= 0.0; - P[ :,[1, end], :] .= 0.0; - P[ :, :,[1, end]] .= 0.0; + P[[1,2, end-1,end], :, :] .= 0.0; + P[ :,[1, end], :] .= 0.0; + P[ :, :,[1, end]] .= 0.0; P = Array(P); P_ref = Array(P_ref); @require !all(CPUArray(P .== P_ref)) @@ -868,14 +992,14 @@ dz = 1.0 @test all(CPUArray(Vz .== Vz_ref)) finalize_global_grid(finalize_MPI=false); end; - @testset "3D (non-default overlap)" begin - init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlapx=3, overlapz=3, quiet=true, init_MPI=false, device_type=device_type); + @testset "3D (non-default overlap and halowidth)" begin + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); Vx = zeros(nx+1,ny,nz); Vx .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)]; Vx_ref = copy(Vx); - Vx[[1, end], :, :] .= 0.0; - Vx[ :,[1, end], :] .= 0.0; - Vx[ :, :,[1, end]] .= 0.0; + Vx[[1,2, end-1,end], :, :] .= 0.0; + Vx[ :,[1, end], :] .= 0.0; + Vx[ :, :,[1, end]] .= 0.0; Vx = Array(Vx); Vx_ref = Array(Vx_ref); @require !all(CPUArray(Vx .== Vx_ref)) @@ -1051,6 +1175,31 @@ dz = 1.0 @test all(CPUArray(Vx .== Vx_ref)) finalize_global_grid(finalize_MPI=false); end; + @testset "3D (two fields simultaneously, non-default overlap and halowidth)" begin + init_global_grid(nx, ny, nz; periodx=1, periody=1, periodz=1, overlaps=(4,2,3), halowidths=(2,1,1), quiet=true, init_MPI=false, device_type=device_type); + Vz = zeros(nx,ny,nz+1); + Vz .= [z_g(iz,dz,Vz)*1e2 + y_g(iy,dy,Vz)*1e1 + x_g(ix,dx,Vz) for ix=1:size(Vz,1), iy=1:size(Vz,2), iz=1:size(Vz,3)]; + Vz_ref = copy(Vz); + Vx = zeros(nx+1,ny,nz); + Vx .= [z_g(iz,dz,Vx)*1e2 + y_g(iy,dy,Vx)*1e1 + x_g(ix,dx,Vx) for ix=1:size(Vx,1), iy=1:size(Vx,2), iz=1:size(Vx,3)]; + Vx_ref = copy(Vx); + Vz[[1,2, end-1,end], :, :] .= 0.0; + Vz[ :,[1, end], :] .= 0.0; + Vz[ :, :,[1, end]] .= 0.0; + Vx[[1,2, end-1,end], :, :] .= 0.0; + Vx[ :,[1, end], :] .= 0.0; + Vx[ :, :,[1, end]] .= 0.0; + Vz = Array(Vz); + Vz_ref = Array(Vz_ref); + Vx = Array(Vx); + Vx_ref = Array(Vx_ref); + @require !all(CPUArray(Vz .== Vz_ref)) + @require !all(CPUArray(Vx .== Vx_ref)) + update_halo!(Vz, Vx); + @test all(CPUArray(Vz .== Vz_ref)) + @test all(CPUArray(Vx .== Vx_ref)) + finalize_global_grid(finalize_MPI=false); + end; end; end; end;