
Commit

Merge pull request #69 from eth-cscs/halowidth
 Generalize halo updates to support halos of any width
omlins committed Dec 1, 2023
2 parents abcfb4c + d6f0413 commit 6bac825
Showing 8 changed files with 429 additions and 226 deletions.
5 changes: 3 additions & 2 deletions Project.toml
@@ -5,7 +5,7 @@ version = "0.13.0"

[compat]
AMDGPU = "0.5"
CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4"
CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5"
LoopVectorization = "0.12"
MPI = "0.20"
julia = "1.9"
@@ -18,7 +18,8 @@ MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

[extras]
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
test = ["Test", "MPIPreferences"]
17 changes: 12 additions & 5 deletions src/init_global_grid.jl
@@ -12,7 +12,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de
- {`periodx`|`periody`|`periodz`}`::Integer=0`: whether the grid is periodic (`1`) or not (`0`) in dimension {x|y|z}.
- `quiet::Bool=false`: whether to suppress printing information like the size of the global grid (`true`) or not (`false`).
!!! note "Advanced keyword arguments"
- {`overlapx`|`overlapy`|`overlapz`}`::Integer=2`: the number of elements adjacent local grids overlap in dimension {x|y|z}. By default (value `2`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlapx=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z.
- `overlaps::Tuple{Int,Int,Int}=(2,2,2)`: the number of elements adjacent local grids overlap in dimension x, y and z. By default (value `(2,2,2)`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlaps[1]=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z.
- `halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2)`: the default width of an array's halo in dimensions x, y and z (must be at least 1). The default can be overwritten per array in the function [`update_halo!`](@ref).
- `disp::Integer=1`: the displacement argument to `MPI.Cart_shift` in order to determine the neighbors.
- `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology.
- `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology.
@@ -37,12 +38,13 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de
See also: [`finalize_global_grid`](@ref), [`select_device`](@ref)
"""
function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlapx::Integer=2, overlapy::Integer=2, overlapz::Integer=2, disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false)
function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false)
if grid_is_initialized() error("The global grid has already been initialized.") end
nxyz = [nx, ny, nz];
dims = [dimx, dimy, dimz];
periods = [periodx, periody, periodz];
overlaps = [overlapx, overlapy, overlapz];
overlaps = [overlaps...];
halowidths = [halowidths...];
cuda_enabled = false
amdgpu_enabled = false
cudaaware_MPI = [false, false, false]
@@ -70,10 +72,15 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
if (any(periods .∉ ((0,1),))) error("Invalid arguments: periodx, periody, and periodz must be either 0 or 1."); end
if (any(halowidths .< 1)) error("Invalid arguments: halowidths cannot be less than 1."); end
if (nx==1) error("Invalid arguments: nx can never be 1.") end
if (ny==1 && nz>1) error("Invalid arguments: ny cannot be 1 if nz is greater than 1.") end
if (any((nxyz .== 1) .& (dims .>1 ))) error("Incoherent arguments: if nx, ny, or nz is 1, then the corresponding dimx, dimy or dimz must not be set (or set 0 or 1)."); end
if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlapx-1, 2*overlapy-1 or 2*overlapz-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end
if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlaps[1]-1, 2*overlaps[2]-1 or 2*overlaps[3]-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end
if (any((overlaps .> 0) .& (halowidths .> overlaps.÷2))) error("Incoherent arguments: if overlap is greater than 0, then halowidth cannot be greater than overlap÷2, in each dimension."); end
dims[(nxyz.==1).&(dims.==0)] .= 1; # Setting any of nxyz to 1, means that the corresponding dimension must also be 1 in the global grid. Thus, the corresponding dims entry must be 1.
if (init_MPI) # NOTE: init MPI only, once the input arguments have been checked.
if (MPI.Initialized()) error("MPI is already initialized. Set the argument 'init_MPI=false'."); end
@@ -91,7 +98,7 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
end
nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2
set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet));
set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet));
if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]))"); end
if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end
init_timing_functions();
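
For context, a minimal usage sketch of the generalized keyword arguments (not part of this diff; the grid size, the array `A`, and the two-process arithmetic are illustrative assumptions):

using ImplicitGlobalGrid

nx, ny, nz = 64, 64, 64
# A wider overlap allows halos of up to overlaps.÷2 cells per side in each dimension.
init_global_grid(nx, ny, nz; overlaps=(4, 4, 4), halowidths=(2, 2, 2))
# For a non-periodic dimension the global size follows dims.*(nxyz.-overlaps) .+ overlaps,
# e.g. 2 processes with nx=64 and overlap 4 give 2*(64-4)+4 = 124 global cells.

A = zeros(nx, ny, nz)
update_halo!(A)        # exchanges halos of the default width (here 2) in every dimension

finalize_global_grid()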
41 changes: 34 additions & 7 deletions src/shared.jl
@@ -38,16 +38,23 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU"
##------
## TYPES

const GGInt = Cint
const GGNumber = Number
const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}}
const GGInt = Cint
const GGNumber = Number
const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}}
const GGField{T,N,T_array} = NamedTuple{(:A, :halowidths), Tuple{T_array, Tuple{GGInt,GGInt,GGInt}}} where {T_array<:GGArray{T,N}}
const GGFieldConvertible{T,N,T_array} = NamedTuple{(:A, :halowidths), <:Tuple{T_array, Tuple{T2,T2,T2}}} where {T_array<:GGArray{T,N}, T2<:Integer}
const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A),typeof(t.A)}((t.A, GGInt.(t.halowidths)))
const CPUField{T,N} = GGField{T,N,Array{T,N}}
const CuField{T,N} = GGField{T,N,CuArray{T,N}}
const ROCField{T,N} = GGField{T,N,ROCArray{T,N}}

"An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests)
struct GlobalGrid
nxyz_g::Vector{GGInt}
nxyz::Vector{GGInt}
dims::Vector{GGInt}
overlaps::Vector{GGInt}
halowidths::Vector{GGInt}
nprocs::GGInt
me::GGInt
coords::Vector{GGInt}
@@ -63,7 +70,7 @@ struct GlobalGrid
loopvectorization::Vector{Bool}
quiet::Bool
end
const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, false, false, [false,false,false], [false,false,false], [false,false,false], false)
const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, false, false, [false,false,false], [false,false,false], [false,false,false], false)

# Macro to switch on/off check_initialized() for performance reasons (potentially relevant for tools.jl).
macro check_initialized() :(check_initialized();) end #FIXME: Alternative: macro check_initialized() end
@@ -92,6 +99,8 @@ me() = global_grid().me
comm() = global_grid().comm
ol(dim::Integer) = global_grid().overlaps[dim]
ol(dim::Integer, A::GGArray) = global_grid().overlaps[dim] + (size(A,dim) - global_grid().nxyz[dim])
ol(A::GGArray) = (ol(dim, A) for dim in 1:ndims(A))
hw_default() = global_grid().halowidths
neighbors(dim::Integer) = global_grid().neighbors[:,dim]
neighbor(n::Integer, dim::Integer) = global_grid().neighbors[n,dim]
cuda_enabled() = global_grid().cuda_enabled
@@ -103,14 +112,32 @@ amdgpuaware_MPI(dim::Integer) = global_grid().amdgpuaware_MPI[dim]
loopvectorization() = global_grid().loopvectorization
loopvectorization(dim::Integer) = global_grid().loopvectorization[dim]
has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL
any_array(fields::GGArray...) = any([is_array(A) for A in fields])
any_cuarray(fields::GGArray...) = any([is_cuarray(A) for A in fields])
any_rocarray(fields::GGArray...) = any([is_rocarray(A) for A in fields])
any_array(fields::GGField...) = any([is_array(A.A) for A in fields])
any_cuarray(fields::GGField...) = any([is_cuarray(A.A) for A in fields])
any_rocarray(fields::GGField...) = any([is_rocarray(A.A) for A in fields])
is_array(A::GGArray) = typeof(A) <: Array
is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer.
is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer.


##--------------------------------------------------------------------------------
## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS

wrap_field(A::GGField) = A
wrap_field(A::GGFieldConvertible) = GGField(A)
wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw))
wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw))
wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw))
wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw)
wrap_field(A::GGArray) = wrap_field(A, hw_default()...)

Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A)
Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...)
Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A)
Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A)
Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A)
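
For illustration, a sketch of per-array halo widths going through the field wrappers above (the array `B` is hypothetical, and the `update_halo!` call form is an assumption inferred from the `GGFieldConvertible` type, not shown in this diff):

# Hypothetical sketch, assuming a grid initialized as in the sketch further above
# (nx = ny = nz = 64, overlaps = (4,4,4), so per-array halo widths up to 2 are allowed).
B = zeros(64, 64, 64)

# Internal wrapping introduced here: pairs an array with its halo widths.
fld = ImplicitGlobalGrid.wrap_field(B, (2, 1, 1))   # a CPUField, i.e. (A = B, halowidths = (2, 1, 1))
@assert size(fld) == size(B)                        # the Base methods above forward to fld.A

# A NamedTuple with keys (:A, :halowidths) matches GGFieldConvertible, which suggests
# per-array halo widths can be passed to update_halo! like this (assumption):
update_halo!((A = B, halowidths = (2, 1, 1)))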


##---------------
## CUDA functions

