exanauts · amontoison · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/.github/workflows/Aqua.yml b/.github/workflows/Aqua.yml
@@ -9,7 +9,7 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@latest
         with:
           version: '1'

diff --git a/Project.toml b/Project.toml
@@ -13,7 +13,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 [compat]
 CEnum = "0.5"
 CUDA = "5.4.0"
-CUDSS_jll = "0.4.0"
+CUDSS_jll = "0.5.0"
 julia = "1.6"
 LinearAlgebra = "1.6"
 SparseArrays = "1.6"

diff --git a/gen/Project.toml b/gen/Project.toml
@@ -5,6 +5,6 @@ Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 
 [compat]
-CUDA_SDK_jll = "12.6.3"
-CUDSS_jll = "0.4.0"
-julia = "1.6"
+CUDA_SDK_jll = "12.8.0"
+CUDSS_jll = "0.5.0"
+julia = "1.10"
diff --git a/gen/wrapper.jl b/gen/wrapper.jl
@@ -166,7 +166,7 @@ function main()
 
     # create context
     headers = ["$cudss/cudss.h"]
-    targets = ["$cudss/cudss.h", "$cudss/cudss_distributed_interface.h"]
+    targets = ["$cudss/cudss.h"]  # ["$cudss/cudss_distributed_interface.h", "$cudss/cudss_threading_interface.h"]
     ctx = create_context(headers, args, options)
 
     # run generator

diff --git a/src/helpers.jl b/src/helpers.jl
@@ -132,7 +132,7 @@ mutable struct CudssBatchedMatrix{T,M}
         ld = nrows
         Mptrs = unsafe_cudss_batch(b)
         M = typeof(Mptrs)
-        cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, T, 'C')
+        cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, Cint, T, 'C')
         obj = new{T,M}(T, matrix_ref[], nbatch, nrows, ncols, Cint[], Mptrs)
         finalizer(cudssBatchedMatrixDestroy, obj)
         obj
@@ -147,9 +147,9 @@ mutable struct CudssBatchedMatrix{T,M}
         Mptrs = unsafe_cudss_batch(B)
         M = typeof(Mptrs)
         if transposed
-            cudssMatrixCreateBatchDn(matrix_ref, nbatch, ncols, nrows, ld, Mptrs, T, 'R')
+            cudssMatrixCreateBatchDn(matrix_ref, nbatch, ncols, nrows, ld, Mptrs, Cint, T, 'R')
         else
-            cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, T, 'C')
+            cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, Cint, T, 'C')
         end
         obj = new{T,M}(T, matrix_ref[], nbatch, nrows, ncols, Cint[], Mptrs)
         finalizer(cudssBatchedMatrixDestroy, obj)

diff --git a/src/interfaces.jl b/src/interfaces.jl
@@ -120,7 +120,10 @@ The available configuration parameters are:
 - `"max_lu_nnz"`: Upper limit on the number of nonzero entries in LU factors for non-symmetric matrices;
 - `"hybrid_mode"`: Memory mode -- `0` (default = device-only) or `1` (hybrid = host/device);
 - `"hybrid_device_memory_limit"`: User-defined device memory limit (number of bytes) for the hybrid memory mode;
-- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode.
+- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode;
+- `"host_nthreads"`: Number of threads to be used by cuDSS in multi-threaded mode;
+- `"hybrid_execute_mode"`: Hybrid execute mode -- `0` (default = device-only) or `1` (hybrid = host/device);
+- `"pivot_epsilon_alg"`: Algorithm for the pivot epsilon calculation.
 
 The available data parameters are:
 - `"info"`: Device-side error information;
@@ -230,7 +233,10 @@ The available configuration parameters are:
 - `"max_lu_nnz"`: Upper limit on the number of nonzero entries in LU factors for non-symmetric matrices;
 - `"hybrid_mode"`: Memory mode -- `0` (default = device-only) or `1` (hybrid = host/device);
 - `"hybrid_device_memory_limit"`: User-defined device memory limit (number of bytes) for the hybrid memory mode;
-- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode.
+- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode;
+- `"host_nthreads"`: Number of threads to be used by cuDSS in multi-threaded mode;
+- `"hybrid_execute_mode"`: Hybrid execute mode -- `0` (default = device-only) or `1` (hybrid = host/device);
+- `"pivot_epsilon_alg"`: Algorithm for the pivot epsilon calculation.
 
 The available data parameters are:
 - `"info"`: Device-side error information;

diff --git a/src/libcudss.jl b/src/libcudss.jl
@@ -5,25 +5,6 @@ const cudaStream_t = CUstream
 const cudaDataType_t = cudaDataType
 const CUPTR_C_NULL = CuPtr{Ptr{Cvoid}}(0)
 
-@cenum cudssOpType_t::UInt32 begin
-    CUDSS_SUM = 0
-    CUDSS_MAX = 1
-    CUDSS_MIN = 2
-end
-
-struct cudssDistributedInterface_t
-    cudssCommRank::Ptr{Cvoid}
-    cudssCommSize::Ptr{Cvoid}
-    cudssSend::Ptr{Cvoid}
-    cudssRecv::Ptr{Cvoid}
-    cudssBcast::Ptr{Cvoid}
-    cudssReduce::Ptr{Cvoid}
-    cudssAllreduce::Ptr{Cvoid}
-    cudssScatterv::Ptr{Cvoid}
-    cudssCommSplit::Ptr{Cvoid}
-    cudssCommFree::Ptr{Cvoid}
-end
-
 mutable struct cudssContext end
 
 const cudssHandle_t = Ptr{cudssContext}
@@ -55,6 +36,9 @@ const cudssConfig_t = Ptr{cudssConfig}
     CUDSS_CONFIG_HYBRID_MODE = 11
     CUDSS_CONFIG_HYBRID_DEVICE_MEMORY_LIMIT = 12
     CUDSS_CONFIG_USE_CUDA_REGISTER_MEMORY = 13
+    CUDSS_CONFIG_HOST_NTHREADS = 14
+    CUDSS_CONFIG_HYBRID_EXECUTE_MODE = 15
+    CUDSS_CONFIG_PIVOT_EPSILON_ALG = 16
 end
 
 @cenum cudssDataParam_t::UInt32 begin
@@ -131,8 +115,9 @@ end
 end
 
 @cenum cudssMatrixFormat_t::UInt32 begin
-    CUDSS_MFORMAT_DENSE = 0
-    CUDSS_MFORMAT_CSR = 1
+    CUDSS_MFORMAT_DENSE = 1
+    CUDSS_MFORMAT_CSR = 2
+    CUDSS_MFORMAT_BATCH = 4
 end
 
 struct cudssDeviceMemHandler_t
@@ -193,6 +178,12 @@ end
                                              commLibFileName::Cstring)::cudssStatus_t
 end
 
+@checked function cudssSetThreadingLayer(handle, thrLibFileName)
+    initialize_context()
+    @gcsafe_ccall libcudss.cudssSetThreadingLayer(handle::cudssHandle_t,
+                                                  thrLibFileName::Cstring)::cudssStatus_t
+end
+
 @checked function cudssConfigCreate(solverConfig)
     initialize_context()
     @gcsafe_ccall libcudss.cudssConfigCreate(solverConfig::Ptr{cudssConfig_t})::cudssStatus_t
@@ -257,12 +248,13 @@ end
 end
 
 @checked function cudssMatrixCreateBatchDn(matrix, batchCount, nrows, ncols, ld, values,
-                                           valueType, layout)
+                                           indexType, valueType, layout)
     initialize_context()
     @gcsafe_ccall libcudss.cudssMatrixCreateBatchDn(matrix::Ptr{cudssMatrix_t},
                                                     batchCount::Int64, nrows::Ptr{Cvoid},
                                                     ncols::Ptr{Cvoid}, ld::Ptr{Cvoid},
                                                     values::CuPtr{Ptr{Cvoid}},
+                                                    indexType::cudaDataType_t,
                                                     valueType::cudaDataType_t,
                                                     layout::cudssLayout_t)::cudssStatus_t
 end
@@ -330,16 +322,17 @@ end
                                                      values::CuPtr{Cvoid})::cudssStatus_t
 end
 
-@checked function cudssMatrixGetBatchDn(matrix, batchCount, nrows, ncols, ld, values, type,
-                                        layout)
+@checked function cudssMatrixGetBatchDn(matrix, batchCount, nrows, ncols, ld, values,
+                                        indexType, valueType, layout)
     initialize_context()
     @gcsafe_ccall libcudss.cudssMatrixGetBatchDn(matrix::cudssMatrix_t,
                                                  batchCount::Ptr{Int64},
                                                  nrows::Ptr{Ptr{Cvoid}},
                                                  ncols::Ptr{Ptr{Cvoid}},
                                                  ld::Ptr{Ptr{Cvoid}},
                                                  values::Ptr{CuPtr{Ptr{Cvoid}}},
-                                                 type::Ptr{cudaDataType_t},
+                                                 indexType::Ptr{cudaDataType_t},
+                                                 valueType::Ptr{cudaDataType_t},
                                                  layout::Ptr{cudssLayout_t})::cudssStatus_t
 end
 
@@ -382,7 +375,7 @@ end
 @checked function cudssMatrixGetFormat(matrix, format)
     initialize_context()
     @gcsafe_ccall libcudss.cudssMatrixGetFormat(matrix::cudssMatrix_t,
-                                                format::Ptr{cudssMatrixFormat_t})::cudssStatus_t
+                                                format::Ptr{Cint})::cudssStatus_t
 end
 
 @checked function cudssGetDeviceMemHandler(handle, handler)

diff --git a/src/types.jl b/src/types.jl
@@ -7,7 +7,8 @@ const CUDSS_DATA_PARAMETERS = ("info", "lu_nnz", "npivots", "inertia", "perm_reo
 const CUDSS_CONFIG_PARAMETERS = ("reordering_alg", "factorization_alg", "solve_alg", "matching_type",
                                  "solve_mode", "ir_n_steps", "ir_tol", "pivot_type", "pivot_threshold",
                                  "pivot_epsilon", "max_lu_nnz", "hybrid_mode", "hybrid_device_memory_limit",
-                                 "use_cuda_register_memory")
+                                 "use_cuda_register_memory", "host_nthreads", "hybrid_execute_mode",
+                                 "pivot_epsilon_alg")
 
 const CUDSS_TYPES = Dict{String, DataType}(
     # data type
@@ -38,7 +39,10 @@ const CUDSS_TYPES = Dict{String, DataType}(
     "max_lu_nnz" => Int64,
     "hybrid_mode" => Cint,
     "hybrid_device_memory_limit" => Int64,
-    "use_cuda_register_memory" => Cint
+    "use_cuda_register_memory" => Cint,
+    "host_nthreads" => Cint,
+    "hybrid_execute_mode" => Cint,
+    "pivot_epsilon_alg" => cudssAlgType_t,
 )
 
 ## config type
@@ -72,6 +76,12 @@ function Base.convert(::Type{cudssConfigParam_t}, config::String)
         return CUDSS_CONFIG_HYBRID_DEVICE_MEMORY_LIMIT
     elseif config == "use_cuda_register_memory"
         return CUDSS_CONFIG_USE_CUDA_REGISTER_MEMORY
+    elseif config == "host_nthreads"
+        return CUDSS_CONFIG_HOST_NTHREADS
+    elseif config == "hybrid_execute_mode"
+        return CUDSS_CONFIG_HYBRID_EXECUTE_MODE
+    elseif config == "pivot_epsilon_alg"
+        return CUDSS_CONFIG_PIVOT_EPSILON_ALG
     else
         throw(ArgumentError("Unknown config parameter $config"))
     end
@@ -226,6 +236,8 @@ function Base.convert(::Type{cudssMatrixFormat_t}, format::Char)
         return CUDSS_MFORMAT_DENSE
     elseif format == 'S'
         return CUDSS_MFORMAT_CSR
+    elseif format == 'B'
+        return CUDSS_MFORMAT_BATCH
     else
         throw(ArgumentError("Unknown format $format"))
     end

diff --git a/test/test_batched_cudss.jl b/test/test_batched_cudss.jl
@@ -6,9 +6,9 @@ function cudss_batched_dense()
       A_cpu = rand(T, n)
       A_gpu = [CuVector(A_cpu)]
       matrix = CudssBatchedMatrix(A_gpu)
-      format = Ref{CUDSS.cudssMatrixFormat_t}()
+      format = Ref{Cint}()
       CUDSS.cudssMatrixGetFormat(matrix, format)
-      @test format[] == CUDSS.CUDSS_MFORMAT_DENSE
+      @test format[] == CUDSS.CUDSS_MFORMAT_BATCH
 
       A_cpu2 = rand(T, n)
       A_gpu2 = [CuVector(A_cpu2)]
@@ -19,9 +19,9 @@ function cudss_batched_dense()
       A_cpu = rand(T, n, p)
       A_gpu = [CuMatrix(A_cpu)]
       matrix = CudssBatchedMatrix(A_gpu)
-      format = Ref{CUDSS.cudssMatrixFormat_t}()
+      format = Ref{Cint}()
       CUDSS.cudssMatrixGetFormat(matrix, format)
-      @test format[] == CUDSS.CUDSS_MFORMAT_DENSE
+      @test format[] == CUDSS.CUDSS_MFORMAT_BATCH
 
       A_cpu2 = rand(T, n, p)
       A_gpu2 = [CuMatrix(A_cpu2)]
@@ -39,9 +39,9 @@ function cudss_batched_sparse()
     @testset "view = $view" for view in ('L', 'U', 'F')
       @testset "structure = $structure" for structure in ("G", "S", "H", "SPD", "HPD")
         matrix = CudssBatchedMatrix(A_gpu, structure, view)
-        format = Ref{CUDSS.cudssMatrixFormat_t}()
+        format = Ref{Cint}()
         CUDSS.cudssMatrixGetFormat(matrix, format)
-        @test format[] == CUDSS.CUDSS_MFORMAT_CSR
+        @test format[] == CUDSS.CUDSS_MFORMAT_BATCH
 
         A_cpu2 = sprand(T, n, n, 1.0)
         A_cpu2 = A_cpu2 + A_cpu2'

diff --git a/test/test_cudss.jl b/test/test_cudss.jl
@@ -1,5 +1,5 @@
 function cudss_version()
-  @test CUDSS.version() >= v"0.4.0"
+  @test CUDSS.version() >= v"0.5.0"
 end
 
 function cudss_dense()
@@ -10,7 +10,7 @@ function cudss_dense()
       A_cpu = rand(T, n)
       A_gpu = CuVector(A_cpu)
       matrix = CudssMatrix(A_gpu)
-      format = Ref{CUDSS.cudssMatrixFormat_t}()
+      format = Ref{Cint}()
       CUDSS.cudssMatrixGetFormat(matrix, format)
       @test format[] == CUDSS.CUDSS_MFORMAT_DENSE
 
@@ -23,7 +23,7 @@ function cudss_dense()
       A_cpu = rand(T, n, p)
       A_gpu = CuMatrix(A_cpu)
       matrix = CudssMatrix(A_gpu)
-      format = Ref{CUDSS.cudssMatrixFormat_t}()
+      format = Ref{Cint}()
       CUDSS.cudssMatrixGetFormat(matrix, format)
       @test format[] == CUDSS.CUDSS_MFORMAT_DENSE
 
@@ -43,7 +43,7 @@ function cudss_sparse()
     @testset "view = $view" for view in ('L', 'U', 'F')
       @testset "structure = $structure" for structure in ("G", "S", "H", "SPD", "HPD")
         matrix = CudssMatrix(A_gpu, structure, view) 
-        format = Ref{CUDSS.cudssMatrixFormat_t}()
+        format = Ref{Cint}()
         CUDSS.cudssMatrixGetFormat(matrix, format)
         @test format[] == CUDSS.CUDSS_MFORMAT_CSR
 
@@ -93,7 +93,9 @@ function cudss_solver()
 
         @testset "config parameter = $parameter" for parameter in CUDSS_CONFIG_PARAMETERS
           @testset "cudss_get" begin
-            val = cudss_get(solver, parameter)
+            if parameter != "host_nthreads"
+              val = cudss_get(solver, parameter)
+            end
           end
           @testset "cudss_set" begin
             (parameter == "matching_type") && cudss_set(solver, parameter, 0)
@@ -104,13 +106,16 @@ function cudss_solver()
             (parameter == "pivot_epsilon") && cudss_set(solver, parameter, 1e-12)
             (parameter == "max_lu_nnz") && cudss_set(solver, parameter, 10)
             (parameter == "hybrid_device_memory_limit") && cudss_set(solver, parameter, 2048)
+            (parameter == "host_nthreads") && cudss_set(solver, parameter, 0)
             for algo in ("default", "algo1", "algo2", "algo3")
               (parameter == "reordering_alg") && cudss_set(solver, parameter, algo)
               (parameter == "factorization_alg") && cudss_set(solver, parameter, algo)
               (parameter == "solve_alg") && cudss_set(solver, parameter, algo)
+              (parameter == "pivot_epsilon_alg") && cudss_set(solver, parameter, algo)
             end
             for flag in (0, 1)
               (parameter == "hybrid_mode") && cudss_set(solver, parameter, flag)
+              (parameter == "hybrid_execute_mode") && cudss_set(solver, parameter, flag)
               (parameter == "use_cuda_register_memory") && cudss_set(solver, parameter, flag)
             end
             for pivoting in ('C', 'R', 'N')