From 06b54a619f6af682ea97962c911ae966842e4bea Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 21 Oct 2024 14:33:59 +0200 Subject: [PATCH] Make LongSequence and LongSubSeq use Memory This makes these two data types more lightweight, requiring only two memory allocations (one for the sequence and one for its memory) as opposed to three. It also improves data locality since loading from the sequences won't require a double load through the indirection of `Vector`. The disadvantages are twofold: First, we lose support for Julia 1.10 since Memory was introduced in Julia 1.11. Second, Vector's code to grow its underlying memory is much more optimised and tested than the manual implementation here in BioSequences. This would matter if users did a lot of resizing operations on biosequences, like `push!` or so. However, I think they don't - they are instead much more likely to create a lot of sequences. If necessary, we can always implement better resizing / growth behaviour for these types. 
--- .github/workflows/UnitTests.yml | 2 +- Project.toml | 2 +- src/longsequences/constructors.jl | 10 +++++----- src/longsequences/copying.jl | 11 ++++++++--- src/longsequences/longsequence.jl | 4 ++-- src/longsequences/randseq.jl | 5 +++-- src/longsequences/seqview.jl | 4 ++-- src/longsequences/transformations.jl | 29 +++++++++++++++++----------- 8 files changed, 40 insertions(+), 27 deletions(-) diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index d8ee9086..e4162cea 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - julia-version: ['1', '1.10'] + julia-version: ['1', '1.11'] os: [ubuntu-latest, macOS-latest, windows-latest] experimental: [false] include: diff --git a/Project.toml b/Project.toml index 42d23282..81a933a4 100644 --- a/Project.toml +++ b/Project.toml @@ -15,7 +15,7 @@ PrecompileTools = "1" Random = "1.5" StableRNGs = "0.1, 1.0" Twiddle = "1.1.1" -julia = "1.10" +julia = "1.11" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index abdd2f94..ecab51c9 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -9,22 +9,22 @@ @inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s)) -@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet +@inline function seq_data_len(::Type{A}, len::Integer)::Int where A <: Alphabet iszero(bits_per_symbol(A())) && return 0 - return cld(len, div(64, bits_per_symbol(A()))) + return cld(len % UInt, div(64, bits_per_symbol(A())) % UInt) % Int end function LongSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet} if len < 0 throw(ArgumentError("len must be non-negative")) end - return LongSequence{A}(Vector{UInt64}(undef, seq_data_len(A, len)), UInt(len)) + return LongSequence{A}(Memory{UInt64}(undef, seq_data_len(A, len)), 
UInt(len)) end # Generic constructor function LongSequence{A}(it) where {A <: Alphabet} len = length(it) - data = Vector{UInt64}(undef, seq_data_len(A, len)) + data = Memory{UInt64}(undef, seq_data_len(A, len)) bits = zero(UInt) bitind = bitindex(BitsPerSymbol(A()), encoded_data_eltype(LongSequence{A}), 1) @inbounds for x in it @@ -41,7 +41,7 @@ function LongSequence{A}(it) where {A <: Alphabet} LongSequence{A}(data, len % UInt) end -Base.empty(::Type{T}) where {T <: LongSequence} = T(UInt[], UInt(0)) +Base.empty(::Type{T}) where {T <: LongSequence} = T(Memory{UInt64}(), UInt(0)) (::Type{T})() where {T <: LongSequence} = empty(T) # Constructors from other sequences diff --git a/src/longsequences/copying.jl b/src/longsequences/copying.jl index b98e0360..45e6ebf0 100644 --- a/src/longsequences/copying.jl +++ b/src/longsequences/copying.jl @@ -34,8 +34,12 @@ function Base.copy!(dst::SeqOrView{<:NucleicAcidAlphabet{N}}, end function _copy!(dst::LongSequence, src::LongSequence) - resize!(dst.data, length(src.data)) - copyto!(dst.data, src.data) + src_data_len = seq_data_len(src) + if length(dst.data) ≥ src_data_len + unsafe_copyto!(dst.data, 1, src.data, 1, src_data_len) + else + dst.data = copy(src.data) + end dst.len = src.len return dst end @@ -48,10 +52,11 @@ function _copy!(dst::SeqOrView{A}, src::SeqOrView) where {A <: Alphabet} end if dst.data === src.data longseq = LongSequence{A}(src) - src_ = LongSubSeq{A}(longseq.data, 1:length(longseq)) + src_ = src isa LongSequence ? 
longseq : LongSubSeq{A}(longseq.data, 1:length(longseq)) else src_ = src end + typeof(src) == typeof(src_) || error() # unreachable return copyto!(dst, 1, src_, 1, length(src)) end diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl index e18f997b..1909cf1a 100644 --- a/src/longsequences/longsequence.jl +++ b/src/longsequences/longsequence.jl @@ -84,10 +84,10 @@ The same applies with `LongSequence{RNAAlphabet{4}}`, simply replace the alphabe parameter with `RNAAlphabet{2}` in order to benefit. """ mutable struct LongSequence{A <: Alphabet} <: BioSequence{A} - const data::Vector{UInt64} # encoded character sequence data + data::Memory{UInt64} # encoded character sequence data len::UInt - function LongSequence{A}(data::Vector{UInt64}, len::UInt) where {A <: Alphabet} + function LongSequence{A}(data::Memory{UInt64}, len::UInt) where {A <: Alphabet} new{A}(data, len) end end diff --git a/src/longsequences/randseq.jl b/src/longsequences/randseq.jl index 8938508b..388181d7 100644 --- a/src/longsequences/randseq.jl +++ b/src/longsequences/randseq.jl @@ -165,8 +165,9 @@ end # the non-ambiguous ones function Random.rand!(rng::AbstractRNG, seq::LongSequence{<:NucleicAcidAlphabet{4}}) data = seq.data - rand!(rng, data) - @inbounds for i in eachindex(data) + len = seq_data_len(seq) + rand!(rng, view(data, 1:len)) + @inbounds for i in 1:len nuc = 0x1111111111111111 mask = data[i] nuc = ((nuc & mask) << 1) | (nuc & ~mask) diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl index 6f17dc31..b182b8ec 100644 --- a/src/longsequences/seqview.jl +++ b/src/longsequences/seqview.jl @@ -20,11 +20,11 @@ AG ``` """ struct LongSubSeq{A<:Alphabet} <: BioSequence{A} - data::Vector{UInt64} + data::Memory{UInt64} part::UnitRange{Int} # Added to reduce method ambiguities - LongSubSeq{A}(data::Vector{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part) + LongSubSeq{A}(data::Memory{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part) 
end # These unions are significant because LongSubSeq and LongSequence have the same diff --git a/src/longsequences/transformations.jl b/src/longsequences/transformations.jl index f1e78178..ba16235e 100644 --- a/src/longsequences/transformations.jl +++ b/src/longsequences/transformations.jl @@ -2,6 +2,15 @@ ### LongSequence specific specializations of src/biosequence/transformations.jl ### +@noinline function resize_memory!(seq::LongSequence, n_chunks::UInt) + oldmem = seq.data + newmem = Memory{UInt64}(undef, n_chunks % Int) + unsafe_copyto!(newmem, 1, oldmem, 1, min(seq_data_len(seq), n_chunks)) + seq.data = newmem + seq +end + +# TODO for new breaking version: Do not allow this API, since we can have invalid symbols in encoding? """ resize!(seq, size, [force::Bool]) @@ -9,15 +18,13 @@ Resize a biological sequence `seq`, to a given `size`. Does not resize the under array unless the new size does not fit. If `force`, always resize underlying data array. """ function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A} - if size < 0 - throw(ArgumentError("size must be non-negative")) - else - if force | (seq_data_len(A, size) > seq_data_len(A, length(seq))) - resize!(seq.data, seq_data_len(A, size)) - end - seq.len = size - return seq + size < 0 && throw(ArgumentError("size must be non-negative")) + usize = UInt(size)::UInt + if force || (seq_data_len(A, usize) > seq_data_len(A, length(seq) % UInt)) + @noinline resize_memory!(seq, seq_data_len(A, usize) % UInt) end + seq.len = size + return seq end """ @@ -92,7 +99,7 @@ end # Reverse chunks in data vector and each symbol within a chunk. Chunks may have nonzero # offset after use, so use zero_offset! 
-@inline function reverse_data!(pred, data::Vector{UInt64}, len::UInt, B::BT) where { +@inline function reverse_data!(pred, data::Memory{UInt64}, len::UInt, B::BT) where { BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}} @inbounds @simd ivdep for i in 1:len >>> 1 data[i], data[len-i+1] = pred(reversebits(data[len-i+1], B)), pred(reversebits(data[i], B)) @@ -102,7 +109,7 @@ end end end -@inline function reverse_data_copy!(pred, dst::Vector{UInt64}, src::Vector{UInt64}, len::UInt, +@inline function reverse_data_copy!(pred, dst::Memory{UInt64}, src::Memory{UInt64}, len::UInt, B::BT) where {BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}} @inbounds @simd for i in eachindex(dst) dst[i] = pred(reversebits(src[len - i + 1], B)) @@ -116,7 +123,7 @@ Make a complement sequence of `seq` in place. """ function complement!(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet} seqdata = seq.data - @inbounds for i in eachindex(seqdata) + @inbounds for i in 1:seq_data_len(seq) seqdata[i] = complement_bitpar(seqdata[i], Alphabet(seq)) end return seq