diff --git a/api/DECAESCLI/cli_builder.jl b/api/DECAESCLI/cli_builder.jl index 58fb99b..6fd530a 100644 --- a/api/DECAESCLI/cli_builder.jl +++ b/api/DECAESCLI/cli_builder.jl @@ -8,7 +8,7 @@ function execute(cmd::Cmd) err = Pipe() process = run(pipeline(ignorestatus(cmd); stdout = err, stderr = err)) close(err.in) - return (stderr = String(read(err)), exitcode = process.exitcode) + return (; stderr = String(read(err)), exitcode = process.exitcode) end function install() @@ -50,7 +50,7 @@ function install() end function cli_script() - + # Create a batch (Windows) or bash (Unix) script for running DECAES CLI cmds = String[] # Julia executable path @@ -68,8 +68,8 @@ function cli_script() push!(cmds, "-- \"\${BASH_SOURCE[0]}\" \"\$@\"") end - # Return batch script on Windows or bash script on Linux if Sys.iswindows() + # Windows batch script """ @echo off setlocal @@ -80,6 +80,7 @@ function cli_script() endlocal """ else + # Unix shell script """ #!/usr/bin/env bash #= diff --git a/src/DECAES.jl b/src/DECAES.jl index b3108aa..e7a6dd6 100644 --- a/src/DECAES.jl +++ b/src/DECAES.jl @@ -16,7 +16,7 @@ using Statistics: Statistics, mean, std using ArgParse: ArgParse, ArgParseSettings, add_arg_group!, add_arg_table!, parse_args using Dierckx: Dierckx using DocStringExtensions: DocStringExtensions, @doc, FIELDS, SIGNATURES, TYPEDFIELDS, TYPEDSIGNATURES -using ForwardDiff: ForwardDiff, DiffResults +using ForwardDiff: ForwardDiff, DiffResults, Dual using LoggingExtras: LoggingExtras, FileLogger, LevelOverrideLogger, TeeLogger, TransformerLogger using MAT: MAT using MuladdMacro: MuladdMacro, @muladd @@ -28,6 +28,7 @@ using Parameters: Parameters, @with_kw, @with_kw_noshow using PrecompileTools: PrecompileTools, @compile_workload, @setup_workload using ProgressMeter: ProgressMeter, Progress, BarGlyphs # using Roots: Roots +# using SLEEFPirates: SLEEFPirates # using SIMD: SIMD, FloatingTypes, Vec, shufflevector using Scratch: Scratch, @get_scratch!, get_scratch! using SpecialFunctions: SpecialFunctions, erfc, erfinv @@ -64,7 +65,7 @@ export main main(["--help"]) main(["--version"]) mock_load_image() - for Reg in ["lcurve", "gcv", "chi2", "mdp"] + for Reg in ["none", "lcurve", "gcv", "chi2", "mdp"] NumVoxels = max(4, Threads.nthreads()) * default_blocksize() mock_T2_pipeline(; MatrixSize = (NumVoxels, 1, 1), Reg) end diff --git a/src/NNLS.jl b/src/NNLS.jl index fc657cf..b4efb21 100644 --- a/src/NNLS.jl +++ b/src/NNLS.jl @@ -211,7 +211,7 @@ Charles L. Lawson and Richard J. Hanson at Jet Propulsion Laboratory Revised FEB 1995 to accompany reprinting of the book by SIAM. """ @inline function construct_householder!(x::AbstractVector{T}) where {T} - if length(x) == 0 + if length(x) <= 1 return zero(T) end @@ -249,7 +249,7 @@ function apply_householder!( tau::T, ) where {T} m = length(u) - if m == 0 + if m <= 1 return nothing end @@ -282,7 +282,7 @@ function apply_householder_dual!( j1::Int, m1::Int, ) where {T} - if j1 > m1 + if j1 >= m1 return nothing end diff --git a/src/T2mapSEcorr.jl b/src/T2mapSEcorr.jl index a6b8c9b..c07d602 100644 --- a/src/T2mapSEcorr.jl +++ b/src/T2mapSEcorr.jl @@ -33,22 +33,22 @@ function T2Maps(opts::T2mapOptions{T}) where {T} convert(Array{T}, copy(thread_buffer.flip_angle_work.decay_basis)), # Default output maps - gdn = fill(T(NaN), opts.MatrixSize...), - ggm = fill(T(NaN), opts.MatrixSize...), - gva = fill(T(NaN), opts.MatrixSize...), - fnr = fill(T(NaN), opts.MatrixSize...), - snr = fill(T(NaN), opts.MatrixSize...), - alpha = fill(T(NaN), opts.MatrixSize...), + gdn = tfill(T(NaN), opts.MatrixSize...), + ggm = tfill(T(NaN), opts.MatrixSize...), + gva = tfill(T(NaN), opts.MatrixSize...), + fnr = tfill(T(NaN), opts.MatrixSize...), + snr = tfill(T(NaN), opts.MatrixSize...), + alpha = tfill(T(NaN), opts.MatrixSize...), is_alpha_provided = Ref(false), # Optional output maps - resnorm = !opts.SaveResidualNorm ? nothing : fill(T(NaN), opts.MatrixSize...), - decaycurve = !opts.SaveDecayCurve ? nothing : fill(T(NaN), opts.MatrixSize..., opts.nTE), - mu = !opts.SaveRegParam ? nothing : fill(T(NaN), opts.MatrixSize...), - chi2factor = !opts.SaveRegParam ? nothing : fill(T(NaN), opts.MatrixSize...), + resnorm = !opts.SaveResidualNorm ? nothing : tfill(T(NaN), opts.MatrixSize...), + decaycurve = !opts.SaveDecayCurve ? nothing : tfill(T(NaN), opts.MatrixSize..., opts.nTE), + mu = !opts.SaveRegParam ? nothing : tfill(T(NaN), opts.MatrixSize...), + chi2factor = !opts.SaveRegParam ? nothing : tfill(T(NaN), opts.MatrixSize...), decaybasis = !opts.SaveNNLSBasis ? nothing : opts.SetFlipAngle === nothing ? - fill(T(NaN), opts.MatrixSize..., opts.nTE, opts.nT2) : # unique decay basis set for each voxel + tfill(T(NaN), opts.MatrixSize..., opts.nTE, opts.nT2) : # unique decay basis set for each voxel convert(Array{T}, copy(thread_buffer.decay_basis)), # single decay basis set used for all voxels ) end @@ -67,7 +67,7 @@ end function T2Distributions(opts::T2mapOptions{T}) where {T} return T2Distributions(; - distributions = fill(T(NaN), opts.MatrixSize..., opts.nT2), + distributions = tfill(T(NaN), opts.MatrixSize..., opts.nT2), ) end @@ -180,7 +180,7 @@ function T2mapSEcorr!( return convert(Dict{String, Any}, maps), convert(Array{T, 4}, dist) end ntasks = opts.Threaded ? Threads.nthreads() : 1 - indices_blocks = split_indices(length(indices), default_blocksize()) + indices_blocks = split_indices(; length = length(indices), minchunksize = default_blocksize()) # Run analysis in parallel with_singlethreaded_blas() do diff --git a/src/T2partSEcorr.jl b/src/T2partSEcorr.jl index 685632a..427ff25 100644 --- a/src/T2partSEcorr.jl +++ b/src/T2partSEcorr.jl @@ -57,8 +57,7 @@ function T2partSEcorr(T2distributions::Array{T, 4}, opts::T2partOptions{T}) wher # Run T2-Part analysis ntasks = opts.Threaded ? Threads.nthreads() : 1 indices = CartesianIndices(opts.MatrixSize) - blocksize = ceil(Int, length(indices) / ntasks) - indices_blocks = split_indices(length(indices), blocksize) + indices_blocks = split_indices(; length = length(indices), minchunksize = default_blocksize(), maxpartitions = ntasks) with_singlethreaded_blas() do workerpool(with_thread_buffer, indices_blocks; ntasks, verbose = !opts.Silent) do inds, thread_buffer @@ -83,10 +82,10 @@ Base.convert(::Type{Dict{String, Any}}, maps::T2Parts) = Dict{String, Any}(Any[s function T2Parts(opts::T2partOptions{T}) where {T} return T2Parts(; - sfr = fill(T(NaN), opts.MatrixSize...), - sgm = fill(T(NaN), opts.MatrixSize...), - mfr = fill(T(NaN), opts.MatrixSize...), - mgm = fill(T(NaN), opts.MatrixSize...), + sfr = tfill(T(NaN), opts.MatrixSize...), + sgm = tfill(T(NaN), opts.MatrixSize...), + mfr = tfill(T(NaN), opts.MatrixSize...), + mgm = tfill(T(NaN), opts.MatrixSize...), ) end diff --git a/src/utils.jl b/src/utils.jl index 3129db4..22296ba 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -342,7 +342,7 @@ function tforeach(work!, allocate, x::AbstractArray; blocksize::Int = default_bl nt = Threads.nthreads() len = length(x) if nt > 1 && len > blocksize - @sync for p in split_indices(len, blocksize) + @sync for p in split_indices(; length = len, minchunksize = blocksize) Threads.@spawn allocate() do resource @simd for i in p work!(x[i], resource) @@ -360,6 +360,26 @@ function tforeach(work!, allocate, x::AbstractArray; blocksize::Int = default_bl end tforeach(f, x::AbstractArray; kwargs...) = tforeach((x, r) -> f(x), g -> g(nothing), x; kwargs...) +function tmap!(f, y::AbstractArray, x::AbstractArray; blocksize::Int = 1024^2) + if Threads.nthreads() > 1 && length(x) >= blocksize + Threads.@threads for i in eachindex(x, y) + @inbounds xi = x[i] + @inbounds y[i] = f(xi) + end + else + @inbounds @simd for i in eachindex(x, y) + xi = x[i] + y[i] = f(xi) + end + end + return x +end +tmap!(f, x::AbstractArray; kwargs...) = tmap!(f, x, x; kwargs...) + +tfill!(v, x) = tmap!(Returns(v), x) +tfill(v, sz::NTuple{N, Int}) where {N} = tfill!(v, Array{typeof(v), N}(undef, sz)) +tfill(v, sz::Int...) = tfill(v, sz) + default_blocksize() = 64 # Worker pool for allocating thread-local resources. This is a more robust alternative to @@ -421,11 +441,29 @@ function workerpool(work!, allocate, inputs, args...; kwargs...) return workerpool(work!, allocate, ch, args...; ninputs = length(inputs), kwargs...) end -function split_indices(len::Int, basesize::Int) - len′ = Int64(len) # Avoid overflow on 32-bit machines - np = max(1, div(len′, basesize)) - return [Int(1 + ((i - 1) * len′) ÷ np):Int((i * len′) ÷ np) for i in 1:np] +function split_indices(; length::Int, minchunksize::Union{Int, Nothing} = nothing, maxpartitions::Union{Int, Nothing} = nothing) + @assert minchunksize === nothing || minchunksize::Int > 0 "Basesize must be a positive integer, got minchunksize = $minchunksize" + @assert maxpartitions === nothing || maxpartitions::Int > 0 "Maximum partitions must be a positive integer, got maxpartitions = $maxpartitions" + return split_indices(length, something(minchunksize, 1)::Int, something(maxpartitions, length)::Int) +end + +function split_indices(len::Int, minchunksize::Int, maxpartitions::Int) + # Partition the range `1:len` into at most `maxpartitions` ranges with balanced lengths which are all at least `min(len, minchunksize)` + chunks = len >= minchunksize * maxpartitions ? maxpartitions : len >= minchunksize ? len ÷ minchunksize : 1 + return split_indices(len, chunks) +end + +function split_indices(len::Int, chunks::Int) + # Partition the range `1:len` into `chunks` ranges with balanced lengths + return MappedArray{UnitRange{Int64}}(SubRange(len, chunks), 1:chunks) +end + +# Produce the `i`th subrange resulting from dividing `1:len` into `chunks` partitions. Assumes 1 <= chunks <= len +struct SubRange + len::Int64 # avoid overflow on 32-bit machines + chunks::Int64 end +(r::SubRange)(i::Int) = Int(1 + ((i - 1) * r.len) ÷ r.chunks):Int((i * r.len) ÷ r.chunks) #### #### Logging diff --git a/test/interactive/compare/compare.jl b/test/interactive/compare/compare.jl index 71e8ebe..135869b 100644 --- a/test/interactive/compare/compare.jl +++ b/test/interactive/compare/compare.jl @@ -15,17 +15,19 @@ versions = [ settings_files = [ "settings.txt", ] -cli_extra_args = `--Reg lcurve --SaveResidualNorm --SaveRegParam --quiet` +cli_extra_args = `--Reg lcurve --SaveResidualNorm --SaveRegParam --quiet` for (julia_version, decaes_versions) in versions for decaes_version in decaes_versions, settings in settings_files - @info "----------------" - @info "Running DECAES with Julia $(julia_version) and DECAES $(decaes_version)" - @info "----------------" - project_dir = joinpath(@__DIR__, "envs", "julia-v" * julia_version, decaes_version) output_dir = joinpath(@__DIR__, "output", splitext(settings)[1], "julia-v" * julia_version * "_" * decaes_version) - @time run(`julia +$(julia_version) --startup-file=no --project=$(project_dir) --threads=$(Threads.nthreads()) -e "using DECAES; main()" @$(joinpath(@__DIR__, "settings", settings)) --output $(output_dir) $(cli_extra_args)`) + cmd = `julia +$(julia_version) --startup-file=no --project=$(project_dir) --threads=auto -e "using DECAES; main()" -- @$(joinpath(@__DIR__, "settings", settings)) --output $(output_dir) $(cli_extra_args)` + + @info "----------------" + @info "Running DECAES with Julia $(julia_version) and DECAES $(decaes_version)" cmd + @info "----------------" + println() + @time run(cmd) println() end end diff --git a/test/utils.jl b/test/utils.jl index 8be2acb..2c879dd 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -99,3 +99,40 @@ end @test @allocated(svdvals!(work, A)) == 0 end end + +@testset "split_indices" begin + function test_valid_partition(p, len) + @test first(first(p)) == 1 + @test last(last(p)) == len + @test all(i -> last(p[i-1]) + 1 == first(p[i]), 2:length(p)) + end + + for len in 1:20, minchunksize in 1:20 + p = DECAES.split_indices(; length = len, minchunksize) + test_valid_partition(p, len) + + if len <= minchunksize + @test length(p) == 1 + @test length(only(p)) == len + else + @test 1 <= length(p) <= len + @test length(p) == len ÷ minchunksize + @test all(c -> length(c) >= minchunksize, p) + end + end + + for len in 1:10, minchunksize in 1:10, maxpartitions in 1:10 + _basesize = min(len, max(minchunksize, len ÷ maxpartitions)) + p = DECAES.split_indices(; length = len, minchunksize, maxpartitions) + test_valid_partition(p, len) + + @test length(p) >= 1 + @test all(c -> length(c) >= _basesize, p) + if len >= minchunksize * maxpartitions + @test length(p) == maxpartitions + else + @test 1 <= length(p) <= min(len, maxpartitions) + @test length(p) == len ÷ _basesize + end + end +end