From c263e948e7723130c3356a109756b580f8091aab Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 1 Dec 2023 19:33:26 +0100
Subject: [PATCH 1/3] enable device_type to be none

---
 src/init_global_grid.jl | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index de47075..62656cc 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -18,8 +18,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de
     - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology.
     - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology.
     - `init_MPI::Bool=true`: whether to initialize MPI (`true`) or not (`false`).
-    - `device_type::String="auto"`: the type of the device to be used if available: "CUDA", "AMDGPU" or "auto". If `device_type` is "auto" (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as "auto".
-    - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA is functional. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
+    - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system that also has GPUs. If `device_type` is `"auto"` (default), the device type is determined automatically, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional and `device_type` is `"auto"`, an error is thrown.
+    - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU is functional and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
     For more information, refer to the documentation of MPI.jl / MPI.
 
 # Return values
@@ -68,10 +68,12 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMY") loopvectorization[2] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMY"]) > 0); end
         if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
     end
-    if !(device_type in [DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_AUTO") end
+    if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
     if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
-    if (device_type in [DEVICE_TYPE_CUDA,   DEVICE_TYPE_AUTO]) cuda_enabled   = cuda_functional()   end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
-    if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
+    if (device_type != DEVICE_TYPE_NONE)
+        if (device_type in [DEVICE_TYPE_CUDA,   DEVICE_TYPE_AUTO]) cuda_enabled   = cuda_functional()   end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
+    end
     if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
     if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
     if (any(periods .∉ ((0,1),))) error("Invalid arguments: periodx, periody, and periodz must be either 0 or 1."); end

From 57a82a2c21ee4cdcd5d4ce9412e5f71e454d1547 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 1 Dec 2023 19:33:50 +0100
Subject: [PATCH 2/3] enable device_type to be none

---
 src/shared.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/shared.jl b/src/shared.jl
index 4a75457..1188958 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -30,6 +30,7 @@ const NDIMS_MPI = 3                    # Internally, we set the number of dimens
 const NNEIGHBORS_PER_DIM = 2           # Number of neighbors per dimension (left neighbor + right neighbor).
 const GG_ALLOC_GRANULARITY = 32        # Internal buffers are allocated with a granulariy of GG_ALLOC_GRANULARITY elements in order to ensure correct reinterpretation when used for different types and to reduce amount of re-allocations.
 const GG_THREADCOPY_THRESHOLD = 32768  # When LoopVectorization is deactivated, then the GG_THREADCOPY_THRESHOLD defines the size in bytes upon which memory copy is performed with multiple threads.
+const DEVICE_TYPE_NONE = "none"
 const DEVICE_TYPE_AUTO = "auto"
 const DEVICE_TYPE_CUDA = "CUDA"
 const DEVICE_TYPE_AMDGPU = "AMDGPU"

From 04e85b44b881e2ff5ff3896a01fe2dc3cbf0db4e Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 1 Dec 2023 19:34:58 +0100
Subject: [PATCH 3/3] extend device selection unit tests

---
 test/test_select_device.jl | 61 ++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/test/test_select_device.jl b/test/test_select_device.jl
index 5f80c63..4a5b37a 100644
--- a/test/test_select_device.jl
+++ b/test/test_select_device.jl
@@ -16,20 +16,57 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); # NOTE: these tests can run with any num
 
 @testset "$(basename(@__FILE__)) (processes: $nprocs)" begin
     @testset "1. select_device" begin
-        @static if test_cuda
-            me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="CUDA");
-            gpu_id = select_device();
-            @test gpu_id < length(CUDA.devices())
-            finalize_global_grid(finalize_MPI=false);
+        @static if test_cuda && !test_amdgpu
+            @testset "\"CUDA\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="CUDA");
+                gpu_id = select_device();
+                @test gpu_id < length(CUDA.devices())
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "\"auto\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto");
+                gpu_id = select_device();
+                @test gpu_id < length(CUDA.devices())
+                finalize_global_grid(finalize_MPI=false);
+            end;
         end
-        @static if test_amdgpu
-            me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU");
-            gpu_id = select_device();
-            @test gpu_id < length(AMDGPU.devices())
-            finalize_global_grid(finalize_MPI=false);
+        @static if test_amdgpu && !test_cuda
+            @testset "\"AMDGPU\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU");
+                gpu_id = select_device();
+                @test gpu_id < length(AMDGPU.devices())
+                finalize_global_grid(finalize_MPI=false);
+            end;
+            @testset "\"auto\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto");
+                gpu_id = select_device();
+                @test gpu_id < length(AMDGPU.devices())
+                finalize_global_grid(finalize_MPI=false);
+            end;
+        end
+        @static if !(test_cuda || test_amdgpu) || (test_cuda && test_amdgpu)
+            @testset "\"auto\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto");
+                @test_throws ErrorException select_device()
+                finalize_global_grid(finalize_MPI=false);
+            end;
+        end
+        @static if !test_cuda
+            @testset "\"CUDA\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="CUDA");
+                @test_throws ErrorException select_device()
+                finalize_global_grid(finalize_MPI=false);
+            end;
+        end
+        @static if !test_amdgpu
+            @testset "\"AMDGPU\"" begin
+                me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU");
+                @test_throws ErrorException select_device()
+                finalize_global_grid(finalize_MPI=false);
+            end;
         end
-        @static if !(test_cuda || test_amdgpu)
-            me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false);
+        @testset "\"none\"" begin
+            me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="none");
             @test_throws ErrorException select_device()
             finalize_global_grid(finalize_MPI=false);
         end