diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl
index f8d2617..4b30446 100644
--- a/src/AMDGPUExt/shared.jl
+++ b/src/AMDGPUExt/shared.jl
@@ -10,6 +10,12 @@ using AMDGPU
 const ROCField{T,N} = GGField{T,N,ROCArray{T,N}}
 
+##------------------------------------
+## HANDLING OF CUDA AND AMDGPU SUPPORT
+
+ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = (@assert AMDGPU.functional(); return true)
+
+
 ##-------------
 ## SYNTAX SUGAR
diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl
index 6b56438..d8f7a95 100644
--- a/src/CUDAExt/shared.jl
+++ b/src/CUDAExt/shared.jl
@@ -10,6 +10,12 @@ using CUDA
 const CuField{T,N} = GGField{T,N,CuArray{T,N}}
 
+##------------------------------------
+## HANDLING OF CUDA AND AMDGPU SUPPORT
+
+ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = (@assert CUDA.functional(true); return true)
+
+
 ##-------------
 ## SYNTAX SUGAR
diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index 62656cc..0e3ed41 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -18,8 +18,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de
     - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology.
     - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology.
     - `init_MPI::Bool=true`: whether to initialize MPI (`true`) or not (`false`).
-    - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as `"auto"`.
-    - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU is functional and `device_type` not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
+    - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) was imported before ImplicitGlobalGrid; if both were imported, an error will be given if `device_type` is set as `"auto"`.
+    - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU was imported and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
 For more information, refer to the documentation of MPI.jl / MPI.
 # Return values
@@ -40,6 +40,8 @@ See also: [`finalize_global_grid`](@ref), [`select_device`](@ref)
 """
 function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false)
     if grid_is_initialized() error("The global grid has already been initialized.") end
+    set_cuda_loaded()
+    set_amdgpu_loaded()
     nxyz    = [nx, ny, nz];
     dims    = [dimx, dimy, dimz];
     periods = [periodx, periody, periodz];
@@ -69,10 +71,10 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
     end
     if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
-    if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
+    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && amdgpu_loaded()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
     if (device_type != DEVICE_TYPE_NONE)
-        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
-        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
     end
     if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
     if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
diff --git a/src/select_device.jl b/src/select_device.jl
index a571c7e..984e672 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -16,10 +16,8 @@ function select_device()
     if cuda_enabled() || amdgpu_enabled()
         check_initialized();
         if cuda_enabled()
-            @assert CUDA.functional(true)
             nb_devices = length(CUDA.devices())
         elseif amdgpu_enabled()
-            @assert AMDGPU.functional()
             nb_devices = length(AMDGPU.devices())
         end
         comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me())
@@ -31,7 +29,7 @@ function select_device()
         end
         return device_id
     else
-        error("Cannot select a device because neither CUDA nor AMDGPU is enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded).")
+        error("Cannot select a device because neither CUDA nor AMDGPU is enabled (meaning that the corresponding module was not imported before ImplicitGlobalGrid).")
     end
 end
diff --git a/src/shared.jl b/src/shared.jl
index 27104c8..f75a135 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -1,24 +1,20 @@
 import MPI
-using CUDA #TODO: to be removed!
-using AMDGPU #TODO: to be removed!
 using Base.Threads
 
-##-------------------------
+##------------------------------------
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
-let
-    global cuda_functional, amdgpu_functional, set_cuda_functional, set_amdgpu_functional
-    _cuda_functional::Bool = false
-    _amdgpu_functional::Bool = false
-    cuda_functional()::Bool = _cuda_functional
-    amdgpu_functional()::Bool = _amdgpu_functional
-    set_cuda_functional(val::Bool) = (_cuda_functional = val;)
-    set_amdgpu_functional(val::Bool) = (_amdgpu_functional = val;)
-end
-
-function __init__()
-    set_cuda_functional(CUDA.functional())
-    set_amdgpu_functional(AMDGPU.functional())
+is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)
+
+let
+    global cuda_loaded, amdgpu_loaded, set_cuda_loaded, set_amdgpu_loaded
+    _cuda_loaded::Bool = false
+    _amdgpu_loaded::Bool = false
+    cuda_loaded()::Bool = _cuda_loaded
+    amdgpu_loaded()::Bool = _amdgpu_loaded
+    set_cuda_loaded() = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
+    set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
 end
diff --git a/src/update_halo.jl b/src/update_halo.jl
index ddcac51..9b1c48b 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -35,8 +35,7 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...)
 end
 
 function _update_halo!(fields::GGField...)
-    if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`.
-    if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`.
+    if (!cuda_enabled() && !amdgpu_enabled() && !all_arrays(fields...)) error("Not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions is loaded, but an array that would require the other extension is passed, is very unlikely and therefore not explicitly checked here (but could be added later).
     allocate_bufs(fields...);
     if any_array(fields...)   allocate_tasks(fields...);   end
     if any_cuarray(fields...) allocate_custreams(fields...); end
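
Below is a minimal usage sketch of the extension-based GPU handling introduced by this diff. It is illustrative only: the grid size, the array, and the choice of the CUDA backend are arbitrary assumptions; only the package and function names that appear in the diff or in the ImplicitGlobalGrid documentation are relied upon.

```julia
using CUDA                 # per the updated docstring, import the GPU package before
using ImplicitGlobalGrid   # ImplicitGlobalGrid so the ImplicitGlobalGrid_CUDAExt extension is loaded

# "auto" resolves to CUDA here because only the CUDA extension is loaded; if AMDGPU.jl
# were imported as well, `device_type` would have to be set explicitly.
me, dims = init_global_grid(64, 64, 64; device_type="auto")

A = CUDA.zeros(Float64, 64, 64, 64)   # local field, sized like the local grid
update_halo!(A)                       # errors if a GPU array is passed but no GPU extension is loaded
finalize_global_grid()
```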