diff --git a/api/DECAESCLI/cli_builder.jl b/api/DECAESCLI/cli_builder.jl
index 58fb99b..6fd530a 100644
--- a/api/DECAESCLI/cli_builder.jl
+++ b/api/DECAESCLI/cli_builder.jl
@@ -8,7 +8,7 @@ function execute(cmd::Cmd)
     err = Pipe()
     process = run(pipeline(ignorestatus(cmd); stdout = err, stderr = err))
     close(err.in)
-    return (stderr = String(read(err)), exitcode = process.exitcode)
+    return (; stderr = String(read(err)), exitcode = process.exitcode)
 end
 
 function install()
@@ -50,7 +50,7 @@ function install()
 end
 
 function cli_script()
-
+    # Create a batch (Windows) or bash (Unix) script for running DECAES CLI
     cmds = String[]
 
     # Julia executable path
@@ -68,8 +68,8 @@ function cli_script()
         push!(cmds, "-- \"\${BASH_SOURCE[0]}\" \"\$@\"")
     end
 
-    # Return batch script on Windows or bash script on Linux
     if Sys.iswindows()
+        # Windows batch script
         """
         @echo off
         setlocal
@@ -80,6 +80,7 @@ function cli_script()
         endlocal
         """
     else
+        # Unix shell script
         """
         #!/usr/bin/env bash
         #=
diff --git a/src/DECAES.jl b/src/DECAES.jl
index b3108aa..e7a6dd6 100644
--- a/src/DECAES.jl
+++ b/src/DECAES.jl
@@ -16,7 +16,7 @@ using Statistics: Statistics, mean, std
 using ArgParse: ArgParse, ArgParseSettings, add_arg_group!, add_arg_table!, parse_args
 using Dierckx: Dierckx
 using DocStringExtensions: DocStringExtensions, @doc, FIELDS, SIGNATURES, TYPEDFIELDS, TYPEDSIGNATURES
-using ForwardDiff: ForwardDiff, DiffResults
+using ForwardDiff: ForwardDiff, DiffResults, Dual
 using LoggingExtras: LoggingExtras, FileLogger, LevelOverrideLogger, TeeLogger, TransformerLogger
 using MAT: MAT
 using MuladdMacro: MuladdMacro, @muladd
@@ -28,6 +28,7 @@ using Parameters: Parameters, @with_kw, @with_kw_noshow
 using PrecompileTools: PrecompileTools, @compile_workload, @setup_workload
 using ProgressMeter: ProgressMeter, Progress, BarGlyphs
 # using Roots: Roots
+# using SLEEFPirates: SLEEFPirates
 # using SIMD: SIMD, FloatingTypes, Vec, shufflevector
 using Scratch: Scratch, @get_scratch!, get_scratch!
 using SpecialFunctions: SpecialFunctions, erfc, erfinv
@@ -64,7 +65,7 @@ export main
         main(["--help"])
         main(["--version"])
         mock_load_image()
-        for Reg in ["lcurve", "gcv", "chi2", "mdp"]
+        for Reg in ["none", "lcurve", "gcv", "chi2", "mdp"]
             NumVoxels = max(4, Threads.nthreads()) * default_blocksize()
             mock_T2_pipeline(; MatrixSize = (NumVoxels, 1, 1), Reg)
         end
diff --git a/src/NNLS.jl b/src/NNLS.jl
index fc657cf..b4efb21 100644
--- a/src/NNLS.jl
+++ b/src/NNLS.jl
@@ -211,7 +211,7 @@ Charles L. Lawson and Richard J. Hanson at Jet Propulsion Laboratory
 Revised FEB 1995 to accompany reprinting of the book by SIAM.
 """
 @inline function construct_householder!(x::AbstractVector{T}) where {T}
-    if length(x) == 0
+    if length(x) <= 1
         return zero(T)
     end
 
@@ -249,7 +249,7 @@ function apply_householder!(
     tau::T,
 ) where {T}
     m = length(u)
-    if m == 0
+    if m <= 1
         return nothing
     end
 
@@ -282,7 +282,7 @@ function apply_householder_dual!(
     j1::Int,
     m1::Int,
 ) where {T}
-    if j1 > m1
+    if j1 >= m1
         return nothing
     end
 
diff --git a/src/T2mapSEcorr.jl b/src/T2mapSEcorr.jl
index a6b8c9b..c07d602 100644
--- a/src/T2mapSEcorr.jl
+++ b/src/T2mapSEcorr.jl
@@ -33,22 +33,22 @@ function T2Maps(opts::T2mapOptions{T}) where {T}
         convert(Array{T}, copy(thread_buffer.flip_angle_work.decay_basis)),
 
         # Default output maps
-        gdn = fill(T(NaN), opts.MatrixSize...),
-        ggm = fill(T(NaN), opts.MatrixSize...),
-        gva = fill(T(NaN), opts.MatrixSize...),
-        fnr = fill(T(NaN), opts.MatrixSize...),
-        snr = fill(T(NaN), opts.MatrixSize...),
-        alpha = fill(T(NaN), opts.MatrixSize...),
+        gdn = tfill(T(NaN), opts.MatrixSize...),
+        ggm = tfill(T(NaN), opts.MatrixSize...),
+        gva = tfill(T(NaN), opts.MatrixSize...),
+        fnr = tfill(T(NaN), opts.MatrixSize...),
+        snr = tfill(T(NaN), opts.MatrixSize...),
+        alpha = tfill(T(NaN), opts.MatrixSize...),
         is_alpha_provided = Ref(false),
 
         # Optional output maps
-        resnorm    = !opts.SaveResidualNorm ? nothing : fill(T(NaN), opts.MatrixSize...),
-        decaycurve = !opts.SaveDecayCurve ? nothing : fill(T(NaN), opts.MatrixSize..., opts.nTE),
-        mu         = !opts.SaveRegParam ? nothing : fill(T(NaN), opts.MatrixSize...),
-        chi2factor = !opts.SaveRegParam ? nothing : fill(T(NaN), opts.MatrixSize...),
+        resnorm    = !opts.SaveResidualNorm ? nothing : tfill(T(NaN), opts.MatrixSize...),
+        decaycurve = !opts.SaveDecayCurve ? nothing : tfill(T(NaN), opts.MatrixSize..., opts.nTE),
+        mu         = !opts.SaveRegParam ? nothing : tfill(T(NaN), opts.MatrixSize...),
+        chi2factor = !opts.SaveRegParam ? nothing : tfill(T(NaN), opts.MatrixSize...),
         decaybasis = !opts.SaveNNLSBasis ? nothing :
         opts.SetFlipAngle === nothing ?
-        fill(T(NaN), opts.MatrixSize..., opts.nTE, opts.nT2) : # unique decay basis set for each voxel
+        tfill(T(NaN), opts.MatrixSize..., opts.nTE, opts.nT2) : # unique decay basis set for each voxel
         convert(Array{T}, copy(thread_buffer.decay_basis)), # single decay basis set used for all voxels
     )
 end
@@ -67,7 +67,7 @@ end
 
 function T2Distributions(opts::T2mapOptions{T}) where {T}
     return T2Distributions(;
-        distributions = fill(T(NaN), opts.MatrixSize..., opts.nT2),
+        distributions = tfill(T(NaN), opts.MatrixSize..., opts.nT2),
     )
 end
 
@@ -180,7 +180,7 @@ function T2mapSEcorr!(
         return convert(Dict{String, Any}, maps), convert(Array{T, 4}, dist)
     end
     ntasks = opts.Threaded ? Threads.nthreads() : 1
-    indices_blocks = split_indices(length(indices), default_blocksize())
+    indices_blocks = split_indices(; length = length(indices), minchunksize = default_blocksize())
 
     # Run analysis in parallel
     with_singlethreaded_blas() do
diff --git a/src/T2partSEcorr.jl b/src/T2partSEcorr.jl
index 685632a..427ff25 100644
--- a/src/T2partSEcorr.jl
+++ b/src/T2partSEcorr.jl
@@ -57,8 +57,7 @@ function T2partSEcorr(T2distributions::Array{T, 4}, opts::T2partOptions{T}) wher
     # Run T2-Part analysis
     ntasks = opts.Threaded ? Threads.nthreads() : 1
     indices = CartesianIndices(opts.MatrixSize)
-    blocksize = ceil(Int, length(indices) / ntasks)
-    indices_blocks = split_indices(length(indices), blocksize)
+    indices_blocks = split_indices(; length = length(indices), minchunksize = default_blocksize(), maxpartitions = ntasks)
 
     with_singlethreaded_blas() do
         workerpool(with_thread_buffer, indices_blocks; ntasks, verbose = !opts.Silent) do inds, thread_buffer
@@ -83,10 +82,10 @@ Base.convert(::Type{Dict{String, Any}}, maps::T2Parts) = Dict{String, Any}(Any[s
 
 function T2Parts(opts::T2partOptions{T}) where {T}
     return T2Parts(;
-        sfr = fill(T(NaN), opts.MatrixSize...),
-        sgm = fill(T(NaN), opts.MatrixSize...),
-        mfr = fill(T(NaN), opts.MatrixSize...),
-        mgm = fill(T(NaN), opts.MatrixSize...),
+        sfr = tfill(T(NaN), opts.MatrixSize...),
+        sgm = tfill(T(NaN), opts.MatrixSize...),
+        mfr = tfill(T(NaN), opts.MatrixSize...),
+        mgm = tfill(T(NaN), opts.MatrixSize...),
     )
 end
 
diff --git a/src/utils.jl b/src/utils.jl
index 3129db4..22296ba 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -342,7 +342,7 @@ function tforeach(work!, allocate, x::AbstractArray; blocksize::Int = default_bl
     nt = Threads.nthreads()
     len = length(x)
     if nt > 1 && len > blocksize
-        @sync for p in split_indices(len, blocksize)
+        @sync for p in split_indices(; length = len, minchunksize = blocksize)
             Threads.@spawn allocate() do resource
                 @simd for i in p
                     work!(x[i], resource)
@@ -360,6 +360,26 @@ function tforeach(work!, allocate, x::AbstractArray; blocksize::Int = default_bl
 end
 tforeach(f, x::AbstractArray; kwargs...) = tforeach((x, r) -> f(x), g -> g(nothing), x; kwargs...)
 
+function tmap!(f, y::AbstractArray, x::AbstractArray; blocksize::Int = 1024^2)
+    if Threads.nthreads() > 1 && length(x) >= blocksize
+        Threads.@threads for i in eachindex(x, y)
+            @inbounds xi = x[i]
+            @inbounds y[i] = f(xi)
+        end
+    else
+        @inbounds @simd for i in eachindex(x, y)
+            xi = x[i]
+            y[i] = f(xi)
+        end
+    end
+    return x
+end
+tmap!(f, x::AbstractArray; kwargs...) = tmap!(f, x, x; kwargs...)
+
+tfill!(v, x) = tmap!(Returns(v), x)
+tfill(v, sz::NTuple{N, Int}) where {N} = tfill!(v, Array{typeof(v), N}(undef, sz))
+tfill(v, sz::Int...) = tfill(v, sz)
+
 default_blocksize() = 64
 
 # Worker pool for allocating thread-local resources. This is a more robust alternative to
@@ -421,11 +441,29 @@ function workerpool(work!, allocate, inputs, args...; kwargs...)
     return workerpool(work!, allocate, ch, args...; ninputs = length(inputs), kwargs...)
 end
 
-function split_indices(len::Int, basesize::Int)
-    len′ = Int64(len) # Avoid overflow on 32-bit machines
-    np = max(1, div(len′, basesize))
-    return [Int(1 + ((i - 1) * len′) ÷ np):Int((i * len′) ÷ np) for i in 1:np]
+function split_indices(; length::Int, minchunksize::Union{Int, Nothing} = nothing, maxpartitions::Union{Int, Nothing} = nothing)
+    @assert minchunksize === nothing || minchunksize::Int > 0 "Basesize must be a positive integer, got minchunksize = $minchunksize"
+    @assert maxpartitions === nothing || maxpartitions::Int > 0 "Maximum partitions must be a positive integer, got maxpartitions = $maxpartitions"
+    return split_indices(length, something(minchunksize, 1)::Int, something(maxpartitions, length)::Int)
+end
+
+function split_indices(len::Int, minchunksize::Int, maxpartitions::Int)
+    # Partition the range `1:len` into at most `maxpartitions` ranges with balanced lengths which are all at least `min(len, minchunksize)`
+    chunks = len >= minchunksize * maxpartitions ? maxpartitions : len >= minchunksize ? len ÷ minchunksize : 1
+    return split_indices(len, chunks)
+end
+
+function split_indices(len::Int, chunks::Int)
+    # Partition the range `1:len` into `chunks` ranges with balanced lengths
+    return MappedArray{UnitRange{Int64}}(SubRange(len, chunks), 1:chunks)
+end
+
+# Produce the `i`th subrange resulting from dividing `1:len` into `chunks` partitions. Assumes 1 <= chunks <= len
+struct SubRange
+    len::Int64 # avoid overflow on 32-bit machines
+    chunks::Int64
 end
+(r::SubRange)(i::Int) = Int(1 + ((i - 1) * r.len) ÷ r.chunks):Int((i * r.len) ÷ r.chunks)
 
 ####
 #### Logging
diff --git a/test/interactive/compare/compare.jl b/test/interactive/compare/compare.jl
index 71e8ebe..135869b 100644
--- a/test/interactive/compare/compare.jl
+++ b/test/interactive/compare/compare.jl
@@ -15,17 +15,19 @@ versions = [
 settings_files = [
     "settings.txt",
 ]
-cli_extra_args = `--Reg lcurve  --SaveResidualNorm --SaveRegParam --quiet`
+cli_extra_args = `--Reg lcurve --SaveResidualNorm --SaveRegParam --quiet`
 
 for (julia_version, decaes_versions) in versions
     for decaes_version in decaes_versions, settings in settings_files
-        @info "----------------"
-        @info "Running DECAES with Julia $(julia_version) and DECAES $(decaes_version)"
-        @info "----------------"
-
         project_dir = joinpath(@__DIR__, "envs", "julia-v" * julia_version, decaes_version)
         output_dir = joinpath(@__DIR__, "output", splitext(settings)[1], "julia-v" * julia_version * "_" * decaes_version)
-        @time run(`julia +$(julia_version) --startup-file=no --project=$(project_dir) --threads=$(Threads.nthreads()) -e "using DECAES; main()" @$(joinpath(@__DIR__, "settings", settings)) --output $(output_dir) $(cli_extra_args)`)
+        cmd = `julia +$(julia_version) --startup-file=no --project=$(project_dir) --threads=auto -e "using DECAES; main()" -- @$(joinpath(@__DIR__, "settings", settings)) --output $(output_dir) $(cli_extra_args)`
+
+        @info "----------------"
+        @info "Running DECAES with Julia $(julia_version) and DECAES $(decaes_version)" cmd
+        @info "----------------"
+        println()
+        @time run(cmd)
         println()
     end
 end
diff --git a/test/utils.jl b/test/utils.jl
index 8be2acb..2c879dd 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -99,3 +99,40 @@ end
         @test @allocated(svdvals!(work, A)) == 0
     end
 end
+
+@testset "split_indices" begin
+    function test_valid_partition(p, len)
+        @test first(first(p)) == 1
+        @test last(last(p)) == len
+        @test all(i -> last(p[i-1]) + 1 == first(p[i]), 2:length(p))
+    end
+
+    for len in 1:20, minchunksize in 1:20
+        p = DECAES.split_indices(; length = len, minchunksize)
+        test_valid_partition(p, len)
+
+        if len <= minchunksize
+            @test length(p) == 1
+            @test length(only(p)) == len
+        else
+            @test 1 <= length(p) <= len
+            @test length(p) == len ÷ minchunksize
+            @test all(c -> length(c) >= minchunksize, p)
+        end
+    end
+
+    for len in 1:10, minchunksize in 1:10, maxpartitions in 1:10
+        _basesize = min(len, max(minchunksize, len ÷ maxpartitions))
+        p = DECAES.split_indices(; length = len, minchunksize, maxpartitions)
+        test_valid_partition(p, len)
+
+        @test length(p) >= 1
+        @test all(c -> length(c) >= _basesize, p)
+        if len >= minchunksize * maxpartitions
+            @test length(p) == maxpartitions
+        else
+            @test 1 <= length(p) <= min(len, maxpartitions)
+            @test length(p) == len ÷ _basesize
+        end
+    end
+end