update handling of gpu support checking if extension is loaded
omlins committed Jan 18, 2024
1 parent 0a4dc18 commit 441aacd
Showing 6 changed files with 32 additions and 25 deletions.
6 changes: 6 additions & 0 deletions src/AMDGPUExt/shared.jl
@@ -10,6 +10,12 @@ using AMDGPU
const ROCField{T,N} = GGField{T,N,ROCArray{T,N}}


##------------------------------------
## HANDLING OF CUDA AND AMDGPU SUPPORT

ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = (@assert AMDGPU.functional(); return true)


##-------------
## SYNTAX SUGAR

6 changes: 6 additions & 0 deletions src/CUDAExt/shared.jl
@@ -10,6 +10,12 @@ using CUDA
const CuField{T,N} = GGField{T,N,CuArray{T,N}}


##------------------------------------
## HANDLING OF CUDA AND AMDGPU SUPPORT

ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = (@assert CUDA.functional(true); return true)


##-------------
## SYNTAX SUGAR

12 changes: 7 additions & 5 deletions src/init_global_grid.jl
@@ -18,8 +18,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de
- `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology.
- `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology.
- `init_MPI::Bool=true`: whether to initialize MPI (`true`) or not (`false`).
- `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as `"auto"`.
- `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU is functional and `device_type` not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
- `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) was imported before ImplicitGlobalGrid; if both were imported, an error will be given if `device_type` is set as `"auto"`.
- `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU was imported and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
For more information, refer to the documentation of MPI.jl / MPI.
# Return values
@@ -40,6 +40,8 @@ See also: [`finalize_global_grid`](@ref), [`select_device`](@ref)
"""
function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false)
    if grid_is_initialized() error("The global grid has already been initialized.") end
    set_cuda_loaded()
    set_amdgpu_loaded()
    nxyz    = [nx, ny, nz];
    dims    = [dimx, dimy, dimz];
    periods = [periodx, periody, periodz];
@@ -69,10 +71,10 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
        if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
    end
    if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
    if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && amdgpu_loaded()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
    if (device_type != DEVICE_TYPE_NONE)
        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
    end
    if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
    if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
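For context, a minimal usage sketch of the behavior documented in the docstring above (grid sizes and the two-value destructuring are illustrative; with this commit, CUDA.jl must be imported before ImplicitGlobalGrid so that the CUDA extension gets loaded at all):

```julia
using CUDA                 # import first: this makes the ImplicitGlobalGrid_CUDAExt extension load
using ImplicitGlobalGrid

# device_type="auto" resolves to CUDA here because only the CUDA extension
# is loaded; it would error if both CUDA.jl and AMDGPU.jl were imported.
me, dims = init_global_grid(64, 64, 64; device_type="auto")
# ... allocate fields, compute, update_halo!(...) ...
finalize_global_grid()
```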
4 changes: 1 addition & 3 deletions src/select_device.jl
@@ -16,10 +16,8 @@ function select_device()
    if cuda_enabled() || amdgpu_enabled()
        check_initialized();
        if cuda_enabled()
            @assert CUDA.functional(true)
            nb_devices = length(CUDA.devices())
        elseif amdgpu_enabled()
            @assert AMDGPU.functional()
            nb_devices = length(AMDGPU.devices())
        end
        comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me())
@@ -31,7 +29,7 @@
        end
        return device_id
    else
        error("Cannot select a device because neither CUDA nor AMDGPU is enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded).")
        error("Cannot select a device because neither CUDA nor AMDGPU is enabled (meaning that the corresponding module was not imported before ImplicitGlobalGrid).")
    end
end

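The node-local device selection that `select_device` performs (see the `MPI.Comm_split_type` line above) can be shown standalone. A sketch of the pattern, assuming a CUDA-capable node rather than reproducing the package's exact implementation:

```julia
using MPI, CUDA

MPI.Init()
comm = MPI.COMM_WORLD

# Group the ranks that share a node (shared memory), then use the
# node-local rank to pick a GPU; this suits both single- and
# multi-device compute nodes.
comm_l    = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, MPI.Comm_rank(comm))
me_l      = MPI.Comm_rank(comm_l)
device_id = me_l % length(CUDA.devices())
CUDA.device!(device_id)
```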
26 changes: 11 additions & 15 deletions src/shared.jl
@@ -1,24 +1,20 @@
import MPI
using CUDA #TODO: to be removed!
using AMDGPU #TODO: to be removed!
using Base.Threads


##-------------------------
##------------------------------------
## HANDLING OF CUDA AND AMDGPU SUPPORT
let
    global cuda_functional, amdgpu_functional, set_cuda_functional, set_amdgpu_functional
    _cuda_functional::Bool = false
    _amdgpu_functional::Bool = false
    cuda_functional()::Bool = _cuda_functional
    amdgpu_functional()::Bool = _amdgpu_functional
    set_cuda_functional(val::Bool) = (_cuda_functional = val;)
    set_amdgpu_functional(val::Bool) = (_amdgpu_functional = val;)
end

function __init__()
    set_cuda_functional(CUDA.functional())
    set_amdgpu_functional(AMDGPU.functional())
is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)

let
    global cuda_loaded, amdgpu_loaded, set_cuda_loaded, set_amdgpu_loaded
    _cuda_loaded::Bool = false
    _amdgpu_loaded::Bool = false
    cuda_loaded()::Bool = _cuda_loaded
    amdgpu_loaded()::Bool = _amdgpu_loaded
    set_cuda_loaded() = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
    set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
end


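The new mechanism in `shared.jl` is plain multiple dispatch: the parent module provides the fallback `is_loaded(arg) = false`, and each package extension adds a method for its own `Val` tag, so the mere existence of that method signals that the extension was loaded. A minimal self-contained sketch of the pattern (module and extension names are illustrative, not taken from this repository):

```julia
module MiniGrid                               # stand-in for ImplicitGlobalGrid
is_loaded(arg) = false                        # fallback: extension not loaded
end

# A package extension would define this method when Julia loads it;
# here it is defined directly to simulate a loaded extension:
MiniGrid.is_loaded(::Val{:MiniGrid_CUDAExt}) = true

MiniGrid.is_loaded(Val(:MiniGrid_CUDAExt))    # -> true  (extension method hit)
MiniGrid.is_loaded(Val(:MiniGrid_AMDGPUExt))  # -> false (fallback)
```

The `@assert CUDA.functional(true)` / `@assert AMDGPU.functional()` in the real extension methods adds a safety net: querying `is_loaded` for a loaded but non-functional GPU stack fails loudly instead of silently reporting the backend as available.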
3 changes: 1 addition & 2 deletions src/update_halo.jl
@@ -35,8 +35,7 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...)
end

function _update_halo!(fields::GGField...)
    if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`.
    if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`.
    if (!cuda_enabled() && !amdgpu_enabled() && !all_arrays(fields...)) error("Not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions is loaded but an array that would require the other extension is passed is very unlikely and therefore not explicitly checked here (but could be added later).
    allocate_bufs(fields...);
    if any_array(fields...) allocate_tasks(fields...); end
    if any_cuarray(fields...) allocate_custreams(fields...); end
