From 06b54a619f6af682ea97962c911ae966842e4bea Mon Sep 17 00:00:00 2001
From: Jakob Nybo Nissen <jakobnybonissen@gmail.com>
Date: Mon, 21 Oct 2024 14:33:59 +0200
Subject: [PATCH] Make LongSequence and LongSubSeq use Memory

This makes these two data types more lightweight, requiring only two memory
allocations (one for the sequence and one for its memory) as opposed to three.
It also improves data locality since loading from the sequences won't require
a double load through the indirection of `Vector`.

The disadvantages are twofold:
First, we lose support for Julia 1.10, since Memory was introduced in Julia 1.11.

Second, Vector's code to grow its underlying memory is much more optimised and
tested than the manual implementation here in BioSequences. This would matter
if users did a lot of resizing operations on biosequences, like `push!` or so.
However, I think they don't - they are instead much more likely to create a lot
of sequences.
If necessary, we can always implement better resizing / growth behaviour for
these types.
---
 .github/workflows/UnitTests.yml      |  2 +-
 Project.toml                         |  2 +-
 src/longsequences/constructors.jl    | 10 +++++-----
 src/longsequences/copying.jl         | 11 ++++++++---
 src/longsequences/longsequence.jl    |  4 ++--
 src/longsequences/randseq.jl         |  5 +++--
 src/longsequences/seqview.jl         |  4 ++--
 src/longsequences/transformations.jl | 29 +++++++++++++++++-----------
 8 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
index d8ee9086..e4162cea 100644
--- a/.github/workflows/UnitTests.yml
+++ b/.github/workflows/UnitTests.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version: ['1', '1.10']
+        julia-version: ['1', '1.11']
         os: [ubuntu-latest, macOS-latest, windows-latest]
         experimental: [false]
         include:
diff --git a/Project.toml b/Project.toml
index 42d23282..81a933a4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,7 +15,7 @@ PrecompileTools = "1"
 Random = "1.5"
 StableRNGs = "0.1, 1.0"
 Twiddle = "1.1.1"
-julia = "1.10"
+julia = "1.11"
 
 [extras]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
index abdd2f94..ecab51c9 100644
--- a/src/longsequences/constructors.jl
+++ b/src/longsequences/constructors.jl
@@ -9,22 +9,22 @@
 
 @inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s))
 
-@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet
+@inline function seq_data_len(::Type{A}, len::Integer)::Int where A <: Alphabet
     iszero(bits_per_symbol(A())) && return 0
-    return cld(len, div(64, bits_per_symbol(A())))
+    return cld(len % UInt, div(64, bits_per_symbol(A())) % UInt) % Int
 end
 
 function LongSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet}
     if len < 0
         throw(ArgumentError("len must be non-negative"))
     end
-    return LongSequence{A}(Vector{UInt64}(undef, seq_data_len(A, len)), UInt(len))
+    return LongSequence{A}(Memory{UInt64}(undef, seq_data_len(A, len)), UInt(len))
 end
 
 # Generic constructor
 function LongSequence{A}(it) where {A <: Alphabet}
     len = length(it)
-    data = Vector{UInt64}(undef, seq_data_len(A, len))
+    data = Memory{UInt64}(undef, seq_data_len(A, len))
     bits = zero(UInt)
     bitind = bitindex(BitsPerSymbol(A()), encoded_data_eltype(LongSequence{A}), 1)
     @inbounds for x in it
@@ -41,7 +41,7 @@ function LongSequence{A}(it) where {A <: Alphabet}
     LongSequence{A}(data, len % UInt)
 end
 
-Base.empty(::Type{T}) where {T <: LongSequence} = T(UInt[], UInt(0))
+Base.empty(::Type{T}) where {T <: LongSequence} = T(Memory{UInt64}(), UInt(0))
 (::Type{T})() where {T <: LongSequence} = empty(T)
 
 # Constructors from other sequences
diff --git a/src/longsequences/copying.jl b/src/longsequences/copying.jl
index b98e0360..45e6ebf0 100644
--- a/src/longsequences/copying.jl
+++ b/src/longsequences/copying.jl
@@ -34,8 +34,12 @@ function Base.copy!(dst::SeqOrView{<:NucleicAcidAlphabet{N}},
 end
 
 function _copy!(dst::LongSequence, src::LongSequence)
-    resize!(dst.data, length(src.data))
-    copyto!(dst.data, src.data)
+    src_data_len = seq_data_len(src)
+    if length(dst.data) ≥ src_data_len
+        unsafe_copyto!(dst.data, 1, src.data, 1, src_data_len)
+    else
+        dst.data = copy(src.data)
+    end
     dst.len = src.len
     return dst
 end
@@ -48,10 +52,11 @@ function _copy!(dst::SeqOrView{A}, src::SeqOrView) where {A <: Alphabet}
 	end
 	if dst.data === src.data
 		longseq = LongSequence{A}(src)
-		src_ = LongSubSeq{A}(longseq.data, 1:length(longseq))
+		src_ = src isa LongSequence ? longseq : LongSubSeq{A}(longseq.data, 1:length(longseq))
 	else
 		src_ = src
 	end
+    typeof(src) == typeof(src_) || error() # unreachable
 	return copyto!(dst, 1, src_, 1, length(src))
 end
 
diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl
index e18f997b..1909cf1a 100644
--- a/src/longsequences/longsequence.jl
+++ b/src/longsequences/longsequence.jl
@@ -84,10 +84,10 @@ The same applies with `LongSequence{RNAAlphabet{4}}`, simply replace the alphabe
 parameter with `RNAAlphabet{2}` in order to benefit.
 """
 mutable struct LongSequence{A <: Alphabet} <: BioSequence{A}
-    const data::Vector{UInt64}  # encoded character sequence data
+    data::Memory{UInt64}  # encoded character sequence data
     len::UInt
 
-    function LongSequence{A}(data::Vector{UInt64}, len::UInt) where {A <: Alphabet}
+    function LongSequence{A}(data::Memory{UInt64}, len::UInt) where {A <: Alphabet}
         new{A}(data, len)
     end
 end
diff --git a/src/longsequences/randseq.jl b/src/longsequences/randseq.jl
index 8938508b..388181d7 100644
--- a/src/longsequences/randseq.jl
+++ b/src/longsequences/randseq.jl
@@ -165,8 +165,9 @@ end
 # the non-ambiguous ones
 function Random.rand!(rng::AbstractRNG, seq::LongSequence{<:NucleicAcidAlphabet{4}})
     data = seq.data
-    rand!(rng, data)
-    @inbounds for i in eachindex(data)
+    len = seq_data_len(seq)
+    rand!(rng, view(data, 1:len))
+    @inbounds for i in 1:len
         nuc = 0x1111111111111111
         mask = data[i]
         nuc = ((nuc & mask) << 1) | (nuc & ~mask)
diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl
index 6f17dc31..b182b8ec 100644
--- a/src/longsequences/seqview.jl
+++ b/src/longsequences/seqview.jl
@@ -20,11 +20,11 @@ AG
 ```
 """
 struct LongSubSeq{A<:Alphabet} <: BioSequence{A}
-    data::Vector{UInt64}
+    data::Memory{UInt64}
     part::UnitRange{Int}
 
 	# Added to reduce method ambiguities
-	LongSubSeq{A}(data::Vector{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part)
+	LongSubSeq{A}(data::Memory{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part)
 end
 
 # These unions are significant because LongSubSeq and LongSequence have the same
diff --git a/src/longsequences/transformations.jl b/src/longsequences/transformations.jl
index f1e78178..ba16235e 100644
--- a/src/longsequences/transformations.jl
+++ b/src/longsequences/transformations.jl
@@ -2,6 +2,15 @@
 ### LongSequence specific specializations of src/biosequence/transformations.jl
 ###
 
+@noinline function resize_memory!(seq::LongSequence, n_chunks::UInt)
+    oldmem = seq.data
+    newmem = Memory{UInt64}(undef, n_chunks % Int)
+    unsafe_copyto!(newmem, 1, oldmem, 1, min(seq_data_len(seq), n_chunks))
+    seq.data = newmem
+    seq
+end
+
+# TODO for new breaking version: Do not allow this API, since we can have invalid symbols in encoding?
 """
     resize!(seq, size, [force::Bool])
 
@@ -9,15 +18,13 @@ Resize a biological sequence `seq`, to a given `size`. Does not resize the under
 array unless the new size does not fit. If `force`, always resize underlying data array.
 """
 function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A}
-    if size < 0
-        throw(ArgumentError("size must be non-negative"))
-    else
-        if force | (seq_data_len(A, size) > seq_data_len(A, length(seq)))
-            resize!(seq.data, seq_data_len(A, size))
-        end
-        seq.len = size
-        return seq
+    size < 0 && throw(ArgumentError("size must be non-negative"))
+    usize = UInt(size)::UInt
+    if force || (seq_data_len(A, usize) > seq_data_len(A, length(seq) % UInt))
+        @noinline resize_memory!(seq, seq_data_len(A, usize) % UInt)
     end
+    seq.len = size
+    return seq
 end
 
 """
@@ -92,7 +99,7 @@ end
 
 # Reverse chunks in data vector and each symbol within a chunk. Chunks may have nonzero
 # offset after use, so use zero_offset!
-@inline function reverse_data!(pred, data::Vector{UInt64}, len::UInt, B::BT) where {
+@inline function reverse_data!(pred, data::Memory{UInt64}, len::UInt, B::BT) where {
     BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}}
     @inbounds @simd ivdep for i in 1:len >>> 1
         data[i], data[len-i+1] = pred(reversebits(data[len-i+1], B)), pred(reversebits(data[i], B))
@@ -102,7 +109,7 @@ end
     end
 end
 
-@inline function reverse_data_copy!(pred, dst::Vector{UInt64}, src::Vector{UInt64}, len::UInt,
+@inline function reverse_data_copy!(pred, dst::Memory{UInt64}, src::Memory{UInt64}, len::UInt,
     B::BT) where {BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}}
     @inbounds @simd for i in eachindex(dst)
         dst[i] = pred(reversebits(src[len - i + 1], B))
@@ -116,7 +123,7 @@ Make a complement sequence of `seq` in place.
 """
 function complement!(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet}
     seqdata = seq.data
-    @inbounds for i in eachindex(seqdata)
+    @inbounds for i in 1:seq_data_len(seq)
         seqdata[i] = complement_bitpar(seqdata[i], Alphabet(seq))
     end
     return seq