diff --git a/.github/workflows/Aqua.yml b/.github/workflows/Aqua.yml index bfc059b..c660dbd 100644 --- a/.github/workflows/Aqua.yml +++ b/.github/workflows/Aqua.yml @@ -9,7 +9,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest with: version: '1' diff --git a/Project.toml b/Project.toml index 7b0f954..f6cdf28 100644 --- a/Project.toml +++ b/Project.toml @@ -13,7 +13,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [compat] CEnum = "0.5" CUDA = "5.4.0" -CUDSS_jll = "0.4.0" +CUDSS_jll = "0.5.0" julia = "1.6" LinearAlgebra = "1.6" SparseArrays = "1.6" diff --git a/gen/Project.toml b/gen/Project.toml index 416896a..419e958 100644 --- a/gen/Project.toml +++ b/gen/Project.toml @@ -5,6 +5,6 @@ Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" [compat] -CUDA_SDK_jll = "12.6.3" -CUDSS_jll = "0.4.0" -julia = "1.6" +CUDA_SDK_jll = "12.8.0" +CUDSS_jll = "0.5.0" +julia = "1.10" diff --git a/gen/wrapper.jl b/gen/wrapper.jl index 0603d5e..c7ca726 100644 --- a/gen/wrapper.jl +++ b/gen/wrapper.jl @@ -166,7 +166,7 @@ function main() # create context headers = ["$cudss/cudss.h"] - targets = ["$cudss/cudss.h", "$cudss/cudss_distributed_interface.h"] + targets = ["$cudss/cudss.h"] # ["$cudss/cudss_distributed_interface.h", "$cudss/cudss_threading_interface.h"] ctx = create_context(headers, args, options) # run generator diff --git a/src/helpers.jl b/src/helpers.jl index 053e936..8f9906b 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -132,7 +132,7 @@ mutable struct CudssBatchedMatrix{T,M} ld = nrows Mptrs = unsafe_cudss_batch(b) M = typeof(Mptrs) - cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, T, 'C') + cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, Cint, T, 'C') obj = new{T,M}(T, matrix_ref[], nbatch, nrows, ncols, Cint[], Mptrs) finalizer(cudssBatchedMatrixDestroy, obj) obj @@ 
-147,9 +147,9 @@ mutable struct CudssBatchedMatrix{T,M} Mptrs = unsafe_cudss_batch(B) M = typeof(Mptrs) if transposed - cudssMatrixCreateBatchDn(matrix_ref, nbatch, ncols, nrows, ld, Mptrs, T, 'R') + cudssMatrixCreateBatchDn(matrix_ref, nbatch, ncols, nrows, ld, Mptrs, Cint, T, 'R') else - cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, T, 'C') + cudssMatrixCreateBatchDn(matrix_ref, nbatch, nrows, ncols, ld, Mptrs, Cint, T, 'C') end obj = new{T,M}(T, matrix_ref[], nbatch, nrows, ncols, Cint[], Mptrs) finalizer(cudssBatchedMatrixDestroy, obj) diff --git a/src/interfaces.jl b/src/interfaces.jl index 6c4c787..7f8f188 100644 --- a/src/interfaces.jl +++ b/src/interfaces.jl @@ -120,7 +120,10 @@ The available configuration parameters are: - `"max_lu_nnz"`: Upper limit on the number of nonzero entries in LU factors for non-symmetric matrices; - `"hybrid_mode"`: Memory mode -- `0` (default = device-only) or `1` (hybrid = host/device); - `"hybrid_device_memory_limit"`: User-defined device memory limit (number of bytes) for the hybrid memory mode; -- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode. +- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode; +- `"host_nthreads"`: Number of threads to be used by cuDSS in multi-threaded mode; +- `"hybrid_execute_mode"`: Hybrid execute mode -- `0` (default = device-only) or `1` (hybrid = host/device); +- `"pivot_epsilon_alg"`: Algorithm for the pivot epsilon calculation. 
The available data parameters are: - `"info"`: Device-side error information; @@ -230,7 +233,10 @@ The available configuration parameters are: - `"max_lu_nnz"`: Upper limit on the number of nonzero entries in LU factors for non-symmetric matrices; - `"hybrid_mode"`: Memory mode -- `0` (default = device-only) or `1` (hybrid = host/device); - `"hybrid_device_memory_limit"`: User-defined device memory limit (number of bytes) for the hybrid memory mode; -- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode. +- `"use_cuda_register_memory"`: A flag to enable (`1`) or disable (`0`) usage of `cudaHostRegister()` by the hybrid memory mode; +- `"host_nthreads"`: Number of threads to be used by cuDSS in multi-threaded mode; +- `"hybrid_execute_mode"`: Hybrid execute mode -- `0` (default = device-only) or `1` (hybrid = host/device); +- `"pivot_epsilon_alg"`: Algorithm for the pivot epsilon calculation. The available data parameters are: - `"info"`: Device-side error information; diff --git a/src/libcudss.jl b/src/libcudss.jl index 4b51f53..509733f 100644 --- a/src/libcudss.jl +++ b/src/libcudss.jl @@ -5,25 +5,6 @@ const cudaStream_t = CUstream const cudaDataType_t = cudaDataType const CUPTR_C_NULL = CuPtr{Ptr{Cvoid}}(0) -@cenum cudssOpType_t::UInt32 begin - CUDSS_SUM = 0 - CUDSS_MAX = 1 - CUDSS_MIN = 2 -end - -struct cudssDistributedInterface_t - cudssCommRank::Ptr{Cvoid} - cudssCommSize::Ptr{Cvoid} - cudssSend::Ptr{Cvoid} - cudssRecv::Ptr{Cvoid} - cudssBcast::Ptr{Cvoid} - cudssReduce::Ptr{Cvoid} - cudssAllreduce::Ptr{Cvoid} - cudssScatterv::Ptr{Cvoid} - cudssCommSplit::Ptr{Cvoid} - cudssCommFree::Ptr{Cvoid} -end - mutable struct cudssContext end const cudssHandle_t = Ptr{cudssContext} @@ -55,6 +36,9 @@ const cudssConfig_t = Ptr{cudssConfig} CUDSS_CONFIG_HYBRID_MODE = 11 CUDSS_CONFIG_HYBRID_DEVICE_MEMORY_LIMIT = 12 CUDSS_CONFIG_USE_CUDA_REGISTER_MEMORY = 13 + CUDSS_CONFIG_HOST_NTHREADS = 14 + 
CUDSS_CONFIG_HYBRID_EXECUTE_MODE = 15 + CUDSS_CONFIG_PIVOT_EPSILON_ALG = 16 end @cenum cudssDataParam_t::UInt32 begin @@ -131,8 +115,9 @@ end end @cenum cudssMatrixFormat_t::UInt32 begin - CUDSS_MFORMAT_DENSE = 0 - CUDSS_MFORMAT_CSR = 1 + CUDSS_MFORMAT_DENSE = 1 + CUDSS_MFORMAT_CSR = 2 + CUDSS_MFORMAT_BATCH = 4 end struct cudssDeviceMemHandler_t @@ -193,6 +178,12 @@ end commLibFileName::Cstring)::cudssStatus_t end +@checked function cudssSetThreadingLayer(handle, thrLibFileName) + initialize_context() + @gcsafe_ccall libcudss.cudssSetThreadingLayer(handle::cudssHandle_t, + thrLibFileName::Cstring)::cudssStatus_t +end + @checked function cudssConfigCreate(solverConfig) initialize_context() @gcsafe_ccall libcudss.cudssConfigCreate(solverConfig::Ptr{cudssConfig_t})::cudssStatus_t @@ -257,12 +248,13 @@ end end @checked function cudssMatrixCreateBatchDn(matrix, batchCount, nrows, ncols, ld, values, - valueType, layout) + indexType, valueType, layout) initialize_context() @gcsafe_ccall libcudss.cudssMatrixCreateBatchDn(matrix::Ptr{cudssMatrix_t}, batchCount::Int64, nrows::Ptr{Cvoid}, ncols::Ptr{Cvoid}, ld::Ptr{Cvoid}, values::CuPtr{Ptr{Cvoid}}, + indexType::cudaDataType_t, valueType::cudaDataType_t, layout::cudssLayout_t)::cudssStatus_t end @@ -330,8 +322,8 @@ end values::CuPtr{Cvoid})::cudssStatus_t end -@checked function cudssMatrixGetBatchDn(matrix, batchCount, nrows, ncols, ld, values, type, - layout) +@checked function cudssMatrixGetBatchDn(matrix, batchCount, nrows, ncols, ld, values, + indexType, valueType, layout) initialize_context() @gcsafe_ccall libcudss.cudssMatrixGetBatchDn(matrix::cudssMatrix_t, batchCount::Ptr{Int64}, @@ -339,7 +331,8 @@ end ncols::Ptr{Ptr{Cvoid}}, ld::Ptr{Ptr{Cvoid}}, values::Ptr{CuPtr{Ptr{Cvoid}}}, - type::Ptr{cudaDataType_t}, + indexType::Ptr{cudaDataType_t}, + valueType::Ptr{cudaDataType_t}, layout::Ptr{cudssLayout_t})::cudssStatus_t end @@ -382,7 +375,7 @@ end @checked function cudssMatrixGetFormat(matrix, format) initialize_context() 
@gcsafe_ccall libcudss.cudssMatrixGetFormat(matrix::cudssMatrix_t, - format::Ptr{cudssMatrixFormat_t})::cudssStatus_t + format::Ptr{Cint})::cudssStatus_t end @checked function cudssGetDeviceMemHandler(handle, handler) diff --git a/src/types.jl b/src/types.jl index bbac262..2a78c49 100644 --- a/src/types.jl +++ b/src/types.jl @@ -7,7 +7,8 @@ const CUDSS_DATA_PARAMETERS = ("info", "lu_nnz", "npivots", "inertia", "perm_reo const CUDSS_CONFIG_PARAMETERS = ("reordering_alg", "factorization_alg", "solve_alg", "matching_type", "solve_mode", "ir_n_steps", "ir_tol", "pivot_type", "pivot_threshold", "pivot_epsilon", "max_lu_nnz", "hybrid_mode", "hybrid_device_memory_limit", - "use_cuda_register_memory") + "use_cuda_register_memory", "host_nthreads", "hybrid_execute_mode", + "pivot_epsilon_alg") const CUDSS_TYPES = Dict{String, DataType}( # data type @@ -38,7 +39,10 @@ const CUDSS_TYPES = Dict{String, DataType}( "max_lu_nnz" => Int64, "hybrid_mode" => Cint, "hybrid_device_memory_limit" => Int64, - "use_cuda_register_memory" => Cint + "use_cuda_register_memory" => Cint, + "host_nthreads" => Cint, + "hybrid_execute_mode" => Cint, + "pivot_epsilon_alg" => cudssAlgType_t, ) ## config type @@ -72,6 +76,12 @@ function Base.convert(::Type{cudssConfigParam_t}, config::String) return CUDSS_CONFIG_HYBRID_DEVICE_MEMORY_LIMIT elseif config == "use_cuda_register_memory" return CUDSS_CONFIG_USE_CUDA_REGISTER_MEMORY + elseif config == "host_nthreads" + return CUDSS_CONFIG_HOST_NTHREADS + elseif config == "hybrid_execute_mode" + return CUDSS_CONFIG_HYBRID_EXECUTE_MODE + elseif config == "pivot_epsilon_alg" + return CUDSS_CONFIG_PIVOT_EPSILON_ALG else throw(ArgumentError("Unknown config parameter $config")) end @@ -226,6 +236,8 @@ function Base.convert(::Type{cudssMatrixFormat_t}, format::Char) return CUDSS_MFORMAT_DENSE elseif format == 'S' return CUDSS_MFORMAT_CSR + elseif format == 'B' + return CUDSS_MFORMAT_BATCH else throw(ArgumentError("Unknown format $format")) end diff --git 
a/test/test_batched_cudss.jl b/test/test_batched_cudss.jl index 0b26a57..e4c8d90 100644 --- a/test/test_batched_cudss.jl +++ b/test/test_batched_cudss.jl @@ -6,9 +6,9 @@ function cudss_batched_dense() A_cpu = rand(T, n) A_gpu = [CuVector(A_cpu)] matrix = CudssBatchedMatrix(A_gpu) - format = Ref{CUDSS.cudssMatrixFormat_t}() + format = Ref{Cint}() CUDSS.cudssMatrixGetFormat(matrix, format) - @test format[] == CUDSS.CUDSS_MFORMAT_DENSE + @test format[] == CUDSS.CUDSS_MFORMAT_BATCH A_cpu2 = rand(T, n) A_gpu2 = [CuVector(A_cpu2)] @@ -19,9 +19,9 @@ function cudss_batched_dense() A_cpu = rand(T, n, p) A_gpu = [CuMatrix(A_cpu)] matrix = CudssBatchedMatrix(A_gpu) - format = Ref{CUDSS.cudssMatrixFormat_t}() + format = Ref{Cint}() CUDSS.cudssMatrixGetFormat(matrix, format) - @test format[] == CUDSS.CUDSS_MFORMAT_DENSE + @test format[] == CUDSS.CUDSS_MFORMAT_BATCH A_cpu2 = rand(T, n, p) A_gpu2 = [CuMatrix(A_cpu2)] @@ -39,9 +39,9 @@ function cudss_batched_sparse() @testset "view = $view" for view in ('L', 'U', 'F') @testset "structure = $structure" for structure in ("G", "S", "H", "SPD", "HPD") matrix = CudssBatchedMatrix(A_gpu, structure, view) - format = Ref{CUDSS.cudssMatrixFormat_t}() + format = Ref{Cint}() CUDSS.cudssMatrixGetFormat(matrix, format) - @test format[] == CUDSS.CUDSS_MFORMAT_CSR + @test format[] == CUDSS.CUDSS_MFORMAT_BATCH A_cpu2 = sprand(T, n, n, 1.0) A_cpu2 = A_cpu2 + A_cpu2' diff --git a/test/test_cudss.jl b/test/test_cudss.jl index 192dc21..6a707a9 100644 --- a/test/test_cudss.jl +++ b/test/test_cudss.jl @@ -1,5 +1,5 @@ function cudss_version() - @test CUDSS.version() >= v"0.4.0" + @test CUDSS.version() >= v"0.5.0" end function cudss_dense() @@ -10,7 +10,7 @@ function cudss_dense() A_cpu = rand(T, n) A_gpu = CuVector(A_cpu) matrix = CudssMatrix(A_gpu) - format = Ref{CUDSS.cudssMatrixFormat_t}() + format = Ref{Cint}() CUDSS.cudssMatrixGetFormat(matrix, format) @test format[] == CUDSS.CUDSS_MFORMAT_DENSE @@ -23,7 +23,7 @@ function cudss_dense() A_cpu = 
rand(T, n, p) A_gpu = CuMatrix(A_cpu) matrix = CudssMatrix(A_gpu) - format = Ref{CUDSS.cudssMatrixFormat_t}() + format = Ref{Cint}() CUDSS.cudssMatrixGetFormat(matrix, format) @test format[] == CUDSS.CUDSS_MFORMAT_DENSE @@ -43,7 +43,7 @@ function cudss_sparse() @testset "view = $view" for view in ('L', 'U', 'F') @testset "structure = $structure" for structure in ("G", "S", "H", "SPD", "HPD") matrix = CudssMatrix(A_gpu, structure, view) - format = Ref{CUDSS.cudssMatrixFormat_t}() + format = Ref{Cint}() CUDSS.cudssMatrixGetFormat(matrix, format) @test format[] == CUDSS.CUDSS_MFORMAT_CSR @@ -93,7 +93,9 @@ function cudss_solver() @testset "config parameter = $parameter" for parameter in CUDSS_CONFIG_PARAMETERS @testset "cudss_get" begin - val = cudss_get(solver, parameter) + if parameter != "host_nthreads" + val = cudss_get(solver, parameter) + end end @testset "cudss_set" begin (parameter == "matching_type") && cudss_set(solver, parameter, 0) @@ -104,13 +106,16 @@ function cudss_solver() (parameter == "pivot_epsilon") && cudss_set(solver, parameter, 1e-12) (parameter == "max_lu_nnz") && cudss_set(solver, parameter, 10) (parameter == "hybrid_device_memory_limit") && cudss_set(solver, parameter, 2048) + (parameter == "host_nthreads") && cudss_set(solver, parameter, 0) for algo in ("default", "algo1", "algo2", "algo3") (parameter == "reordering_alg") && cudss_set(solver, parameter, algo) (parameter == "factorization_alg") && cudss_set(solver, parameter, algo) (parameter == "solve_alg") && cudss_set(solver, parameter, algo) + (parameter == "pivot_epsilon_alg") && cudss_set(solver, parameter, algo) end for flag in (0, 1) (parameter == "hybrid_mode") && cudss_set(solver, parameter, flag) + (parameter == "hybrid_execute_mode") && cudss_set(solver, parameter, flag) (parameter == "use_cuda_register_memory") && cudss_set(solver, parameter, flag) end for pivoting in ('C', 'R', 'N')