BioJulia · jakobnissen · Oct 21, 2024
diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version: ['1', '1.10']
+        julia-version: ['1', '1.11']
         os: [ubuntu-latest, macOS-latest, windows-latest]
         experimental: [false]
         include:

diff --git a/Project.toml b/Project.toml
@@ -15,7 +15,7 @@ PrecompileTools = "1"
 Random = "1.5"
 StableRNGs = "0.1, 1.0"
 Twiddle = "1.1.1"
-julia = "1.10"
+julia = "1.11"
 
 [extras]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -9,22 +9,22 @@
 
 @inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s))
 
-@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet
+@inline function seq_data_len(::Type{A}, len::Integer)::Int where A <: Alphabet
     iszero(bits_per_symbol(A())) && return 0
-    return cld(len, div(64, bits_per_symbol(A())))
+    return cld(len % UInt, div(64, bits_per_symbol(A())) % UInt) % Int  # UInt64 chunks needed for `len` symbols; `%` conversions wrap instead of checking
 end
 
 function LongSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet}
     if len < 0
         throw(ArgumentError("len must be non-negative"))
     end
-    return LongSequence{A}(Vector{UInt64}(undef, seq_data_len(A, len)), UInt(len))
+    return LongSequence{A}(Memory{UInt64}(undef, seq_data_len(A, len)), UInt(len))  # chunk contents are left uninitialized
 end
 
 # Generic constructor
 function LongSequence{A}(it) where {A <: Alphabet}
     len = length(it)
-    data = Vector{UInt64}(undef, seq_data_len(A, len))
+    data = Memory{UInt64}(undef, seq_data_len(A, len))
     bits = zero(UInt)
     bitind = bitindex(BitsPerSymbol(A()), encoded_data_eltype(LongSequence{A}), 1)
     @inbounds for x in it
@@ -41,7 +41,7 @@ function LongSequence{A}(it) where {A <: Alphabet}
     LongSequence{A}(data, len % UInt)
 end
 
-Base.empty(::Type{T}) where {T <: LongSequence} = T(UInt[], UInt(0))
+Base.empty(::Type{T}) where {T <: LongSequence} = T(Memory{UInt64}(), UInt(0))  # zero-argument Memory constructor, length 0
 (::Type{T})() where {T <: LongSequence} = empty(T)
 
 # Constructors from other sequences

diff --git a/src/longsequences/copying.jl b/src/longsequences/copying.jl
@@ -34,8 +34,12 @@ function Base.copy!(dst::SeqOrView{<:NucleicAcidAlphabet{N}},
 end
 
 function _copy!(dst::LongSequence, src::LongSequence)
-    resize!(dst.data, length(src.data))
-    copyto!(dst.data, src.data)
+    src_data_len = seq_data_len(src)  # chunks needed for length(src) symbols
+    if length(dst.data) ≥ src_data_len  # dst's buffer is large enough: reuse it
+        unsafe_copyto!(dst.data, 1, src.data, 1, src_data_len)
+    else
+        dst.data = copy(src.data)  # too small: replace dst's buffer with a copy of src's whole buffer
+    end
     dst.len = src.len
     return dst
 end
@@ -48,10 +52,11 @@ function _copy!(dst::SeqOrView{A}, src::SeqOrView) where {A <: Alphabet}
 	end
 	if dst.data === src.data
 		longseq = LongSequence{A}(src)
-		src_ = LongSubSeq{A}(longseq.data, 1:length(longseq))
+		src_ = src isa LongSequence ? longseq : LongSubSeq{A}(longseq.data, 1:length(longseq))
 	else
 		src_ = src
 	end
+    typeof(src) == typeof(src_) || error() # unreachable
 	return copyto!(dst, 1, src_, 1, length(src))
 end
 

diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl
@@ -84,10 +84,10 @@ The same applies with `LongSequence{RNAAlphabet{4}}`, simply replace the alphabe
 parameter with `RNAAlphabet{2}` in order to benefit.
 """
 mutable struct LongSequence{A <: Alphabet} <: BioSequence{A}
-    const data::Vector{UInt64}  # encoded character sequence data
+    data::Memory{UInt64}  # encoded character sequence data; not `const` because the buffer is reassigned when it must be replaced
     len::UInt
 
-    function LongSequence{A}(data::Vector{UInt64}, len::UInt) where {A <: Alphabet}
+    function LongSequence{A}(data::Memory{UInt64}, len::UInt) where {A <: Alphabet}
         new{A}(data, len)
     end
 end

diff --git a/src/longsequences/randseq.jl b/src/longsequences/randseq.jl
@@ -165,8 +165,9 @@ end
 # the non-ambiguous ones
 function Random.rand!(rng::AbstractRNG, seq::LongSequence{<:NucleicAcidAlphabet{4}})
     data = seq.data
-    rand!(rng, data)
-    @inbounds for i in eachindex(data)
+    len = seq_data_len(seq)
+    rand!(rng, view(data, 1:len))
+    @inbounds for i in 1:len
         nuc = 0x1111111111111111
         mask = data[i]
         nuc = ((nuc & mask) << 1) | (nuc & ~mask)

diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl
@@ -20,11 +20,11 @@ AG
 ```
 """
 struct LongSubSeq{A<:Alphabet} <: BioSequence{A}
-    data::Vector{UInt64}
+    data::Memory{UInt64}  # encoded chunks; the constructor stores this reference without copying
     part::UnitRange{Int}
 
 	# Added to reduce method ambiguities
-	LongSubSeq{A}(data::Vector{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part)
+	LongSubSeq{A}(data::Memory{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part)
 end
 
 # These unions are significant because LongSubSeq and LongSequence have the same

diff --git a/src/longsequences/transformations.jl b/src/longsequences/transformations.jl
@@ -2,22 +2,29 @@
 ### LongSequence specific specializations of src/biosequence/transformations.jl
 ###
 
+@noinline function resize_memory!(seq::LongSequence, n_chunks::UInt)  # swaps the buffer only; `seq.len` is left as is
+    oldmem = seq.data
+    newmem = Memory{UInt64}(undef, n_chunks % Int)  # uninitialized buffer of `n_chunks` chunks
+    unsafe_copyto!(newmem, 1, oldmem, 1, min(seq_data_len(seq), n_chunks))  # keep chunks in use, truncated when shrinking
+    seq.data = newmem
+    return seq
+end
+
+# TODO (next breaking release): consider disallowing this API, since growing a sequence can leave invalid symbols in its encoding.
 """
     resize!(seq, size, [force::Bool])
 
 Resize a biological sequence `seq`, to a given `size`. Does not resize the underlying data
 array unless the new size does not fit. If `force`, always resize underlying data array.
 """
 function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A}
-    if size < 0
-        throw(ArgumentError("size must be non-negative"))
-    else
-        if force | (seq_data_len(A, size) > seq_data_len(A, length(seq)))
-            resize!(seq.data, seq_data_len(A, size))
-        end
-        seq.len = size
-        return seq
+    size < 0 && throw(ArgumentError("size must be non-negative"))
+    usize = UInt(size)::UInt  # cannot throw: `size` was checked to be non-negative above
+    if force || (seq_data_len(A, usize) > length(seq.data))  # reallocate only when forced or the buffer is too small
+        @noinline resize_memory!(seq, seq_data_len(A, usize) % UInt)
     end
+    seq.len = usize
+    return seq
 end
 
 """
@@ -92,7 +99,7 @@ end
 
 # Reverse chunks in data vector and each symbol within a chunk. Chunks may have nonzero
 # offset after use, so use zero_offset!
-@inline function reverse_data!(pred, data::Vector{UInt64}, len::UInt, B::BT) where {
+@inline function reverse_data!(pred, data::Memory{UInt64}, len::UInt, B::BT) where {
     BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}}
     @inbounds @simd ivdep for i in 1:len >>> 1
         data[i], data[len-i+1] = pred(reversebits(data[len-i+1], B)), pred(reversebits(data[i], B))
@@ -102,7 +109,7 @@ end
     end
 end
 
-@inline function reverse_data_copy!(pred, dst::Vector{UInt64}, src::Vector{UInt64}, len::UInt,
+@inline function reverse_data_copy!(pred, dst::Memory{UInt64}, src::Memory{UInt64}, len::UInt,
     B::BT) where {BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}}
     @inbounds @simd for i in eachindex(dst)
         dst[i] = pred(reversebits(src[len - i + 1], B))
@@ -116,7 +123,7 @@ Make a complement sequence of `seq` in place.
 """
 function complement!(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet}
     seqdata = seq.data
-    @inbounds for i in eachindex(seqdata)
+    @inbounds for i in 1:seq_data_len(seq)
         seqdata[i] = complement_bitpar(seqdata[i], Alphabet(seq))
     end
     return seq