From c263e948e7723130c3356a109756b580f8091aab Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 1 Dec 2023 19:33:26 +0100 Subject: [PATCH 1/3] enable device_type to be none --- src/init_global_grid.jl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index de47075..62656cc 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -18,8 +18,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `init_MPI::Bool=true`: whether to initialize MPI (`true`) or not (`false`). - - `device_type::String="auto"`: the type of the device to be used if available: "CUDA", "AMDGPU" or "auto". If `device_type` is "auto" (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as "auto". - - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA is functional. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref). + - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. 
If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as `"auto"`. + - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU is functional and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref). For more information, refer to the documentation of MPI.jl / MPI. # Return values @@ -68,10 +68,12 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMY") loopvectorization[2] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMY"]) > 0); end if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end end - if !(device_type in [DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_AUTO") end + if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. 
Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end - if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. - if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + if (device_type != DEVICE_TYPE_NONE) + if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. + if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + end if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end if (any(periods .∉ ((0,1),))) error("Invalid arguments: periodx, periody, and periodz must be either 0 or 1."); end From 57a82a2c21ee4cdcd5d4ce9412e5f71e454d1547 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 1 Dec 2023 19:33:50 +0100 Subject: [PATCH 2/3] enable device_type to be none --- src/shared.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/shared.jl b/src/shared.jl index 4a75457..1188958 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -30,6 +30,7 @@ const NDIMS_MPI = 3 # Internally, we set the number of dimens const NNEIGHBORS_PER_DIM = 2 # Number of neighbors per dimension (left neighbor + right neighbor). const GG_ALLOC_GRANULARITY = 32 # Internal buffers are allocated with a granulariy of GG_ALLOC_GRANULARITY elements in order to ensure correct reinterpretation when used for different types and to reduce amount of re-allocations. 
const GG_THREADCOPY_THRESHOLD = 32768 # When LoopVectorization is deactivated, then the GG_THREADCOPY_THRESHOLD defines the size in bytes upon which memory copy is performed with multiple threads. +const DEVICE_TYPE_NONE = "none" const DEVICE_TYPE_AUTO = "auto" const DEVICE_TYPE_CUDA = "CUDA" const DEVICE_TYPE_AMDGPU = "AMDGPU" From 04e85b44b881e2ff5ff3896a01fe2dc3cbf0db4e Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 1 Dec 2023 19:34:58 +0100 Subject: [PATCH 3/3] extend device selection unit tests --- test/test_select_device.jl | 61 ++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/test/test_select_device.jl b/test/test_select_device.jl index 5f80c63..4a5b37a 100644 --- a/test/test_select_device.jl +++ b/test/test_select_device.jl @@ -16,20 +16,57 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); # NOTE: these tests can run with any num @testset "$(basename(@__FILE__)) (processes: $nprocs)" begin @testset "1. select_device" begin - @static if test_cuda - me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="CUDA"); - gpu_id = select_device(); - @test gpu_id < length(CUDA.devices()) - finalize_global_grid(finalize_MPI=false); + @static if test_cuda && !test_amdgpu + @testset "\"CUDA\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="CUDA"); + gpu_id = select_device(); + @test gpu_id < length(CUDA.devices()) + finalize_global_grid(finalize_MPI=false); + end; + @testset "\"auto\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto"); + gpu_id = select_device(); + @test gpu_id < length(CUDA.devices()) + finalize_global_grid(finalize_MPI=false); + end; end - @static if test_amdgpu - me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU"); - gpu_id = select_device(); - @test gpu_id < length(AMDGPU.devices()) - finalize_global_grid(finalize_MPI=false); + @static if test_amdgpu && !test_cuda + @testset 
"\"AMDGPU\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU"); + gpu_id = select_device(); + @test gpu_id < length(AMDGPU.devices()) + finalize_global_grid(finalize_MPI=false); + end; + @testset "\"auto\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto"); + gpu_id = select_device(); + @test gpu_id < length(AMDGPU.devices()) + finalize_global_grid(finalize_MPI=false); + end; + end + @static if !(test_cuda || test_amdgpu) || (test_cuda && test_amdgpu) + @testset "\"auto\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto"); + @test_throws ErrorException select_device() + finalize_global_grid(finalize_MPI=false); + end; + end + @static if !test_cuda + @testset "\"CUDA\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="CUDA"); + @test_throws ErrorException select_device() + finalize_global_grid(finalize_MPI=false); + end; + end + @static if !test_amdgpu + @testset "\"AMDGPU\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU"); + @test_throws ErrorException select_device() + finalize_global_grid(finalize_MPI=false); + end; end - @static if !(test_cuda || test_amdgpu) - me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false); + @testset "\"none\"" begin + me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="none"); @test_throws ErrorException select_device() finalize_global_grid(finalize_MPI=false); end