Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP Make LongSequence and LongSubSeq use Memory [DO NOT MERGE] #317

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/UnitTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: false
matrix:
julia-version: ['1', '1.10']
julia-version: ['1', '1.11']
os: [ubuntu-latest, macOS-latest, windows-latest]
experimental: [false]
include:
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ PrecompileTools = "1"
Random = "1.5"
StableRNGs = "0.1, 1.0"
Twiddle = "1.1.1"
julia = "1.10"
julia = "1.11"

[extras]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Expand Down
10 changes: 5 additions & 5 deletions src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,22 @@

# Number of encoded data chunks that `s` occupies, derived from its alphabet type
# and its current length by forwarding to the `(alphabet type, length)` method.
@inline function seq_data_len(s::LongSequence{A}) where A
    return seq_data_len(A, length(s))
end

# Number of 64-bit chunks needed to hold `len` symbols of alphabet `A`.
# Returns 0 when the alphabet uses zero bits per symbol; otherwise divides `len` by
# the number of symbols that fit in 64 bits (`div(64, bits_per_symbol(A()))`),
# rounding up with `cld`.
# NOTE(review): this span is a rendered diff without +/- markers, so the pre-change
# signature and return line sit directly above their post-change replacements.
@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet
@inline function seq_data_len(::Type{A}, len::Integer)::Int where A <: Alphabet
iszero(bits_per_symbol(A())) && return 0
return cld(len, div(64, bits_per_symbol(A())))
# Post-change form: both operands are taken `% UInt` before `cld`, and the result is
# taken `% Int` to match the declared `::Int` return type.
return cld(len % UInt, div(64, bits_per_symbol(A())) % UInt) % Int
end

# Construct a sequence of `len` symbols whose backing storage is allocated with
# `undef` (contents unspecified until written). The storage holds
# `seq_data_len(A, len)` `UInt64` chunks.
# Throws `ArgumentError` when `len` is negative.
# NOTE(review): rendered diff — the `Vector{UInt64}` return line is the pre-change
# version and the `Memory{UInt64}` line below it is its replacement.
function LongSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet}
if len < 0
throw(ArgumentError("len must be non-negative"))
end
return LongSequence{A}(Vector{UInt64}(undef, seq_data_len(A, len)), UInt(len))
return LongSequence{A}(Memory{UInt64}(undef, seq_data_len(A, len)), UInt(len))
end

# Generic constructor
function LongSequence{A}(it) where {A <: Alphabet}
len = length(it)
data = Vector{UInt64}(undef, seq_data_len(A, len))
data = Memory{UInt64}(undef, seq_data_len(A, len))
bits = zero(UInt)
bitind = bitindex(BitsPerSymbol(A()), encoded_data_eltype(LongSequence{A}), 1)
@inbounds for x in it
Expand All @@ -41,7 +41,7 @@ function LongSequence{A}(it) where {A <: Alphabet}
LongSequence{A}(data, len % UInt)
end

# `empty(T)` builds a sequence of type `T` with a stored length of zero.
# NOTE(review): rendered diff — the first line (passing `UInt[]`) is the pre-change
# version; the second passes `Memory{UInt64}()`, presumably a zero-length buffer —
# confirm against the Julia 1.11 `Memory` constructors.
Base.empty(::Type{T}) where {T <: LongSequence} = T(UInt[], UInt(0))
Base.empty(::Type{T}) where {T <: LongSequence} = T(Memory{UInt64}(), UInt(0))
# Calling the type with no arguments forwards to `empty(T)`.
(::Type{T})() where {T <: LongSequence} = empty(T)

# Constructors from other sequences
Expand Down
11 changes: 8 additions & 3 deletions src/longsequences/copying.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,12 @@ function Base.copy!(dst::SeqOrView{<:NucleicAcidAlphabet{N}},
end

# Copy the encoded data and the length of `src` into `dst`, then return `dst`.
# NOTE(review): rendered diff — the `resize!`/`copyto!` pair is the pre-change body;
# the `src_data_len` lines below it are the post-change body.
function _copy!(dst::LongSequence, src::LongSequence)
# Pre-change: make `dst.data` the same length as `src.data` and copy all of it.
resize!(dst.data, length(src.data))
copyto!(dst.data, src.data)
# Post-change: only the first `seq_data_len(src)` chunks of `src.data` are copied
# when `dst.data` is already long enough; otherwise `dst.data` is replaced by a
# fresh copy of `src.data`.
src_data_len = seq_data_len(src)
if length(dst.data) ≥ src_data_len
unsafe_copyto!(dst.data, 1, src.data, 1, src_data_len)
else
dst.data = copy(src.data)
end
dst.len = src.len
return dst
end
Expand All @@ -48,10 +52,11 @@ function _copy!(dst::SeqOrView{A}, src::SeqOrView) where {A <: Alphabet}
end
if dst.data === src.data
longseq = LongSequence{A}(src)
src_ = LongSubSeq{A}(longseq.data, 1:length(longseq))
src_ = src isa LongSequence ? longseq : LongSubSeq{A}(longseq.data, 1:length(longseq))
else
src_ = src
end
typeof(src) == typeof(src_) || error() # unreachable
return copyto!(dst, 1, src_, 1, length(src))
end

Expand Down
4 changes: 2 additions & 2 deletions src/longsequences/longsequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,10 @@ The same applies with `LongSequence{RNAAlphabet{4}}`, simply replace the alphabe
parameter with `RNAAlphabet{2}` in order to benefit.
"""
mutable struct LongSequence{A <: Alphabet} <: BioSequence{A}
# NOTE(review): rendered diff — each `Vector{UInt64}` line is the pre-change version
# of the `Memory{UInt64}` line that follows it. The post-change `data` field drops
# `const`, which is what allows other code in this change to assign `seq.data`.
const data::Vector{UInt64} # encoded character sequence data
data::Memory{UInt64} # encoded character sequence data
# Stored as `UInt`; constructors in this change pass `UInt(len)` or `len % UInt`.
len::UInt

# Inner constructor: stores `data` and `len` as given, with no validation.
function LongSequence{A}(data::Vector{UInt64}, len::UInt) where {A <: Alphabet}
function LongSequence{A}(data::Memory{UInt64}, len::UInt) where {A <: Alphabet}
new{A}(data, len)
end
end
Expand Down
5 changes: 3 additions & 2 deletions src/longsequences/randseq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,9 @@ end
# the non-ambiguous ones
function Random.rand!(rng::AbstractRNG, seq::LongSequence{<:NucleicAcidAlphabet{4}})
data = seq.data
rand!(rng, data)
@inbounds for i in eachindex(data)
len = seq_data_len(seq)
rand!(rng, view(data, 1:len))
@inbounds for i in 1:len
nuc = 0x1111111111111111
mask = data[i]
nuc = ((nuc & mask) << 1) | (nuc & ~mask)
Expand Down
4 changes: 2 additions & 2 deletions src/longsequences/seqview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ AG
```
"""
struct LongSubSeq{A<:Alphabet} <: BioSequence{A}
# NOTE(review): rendered diff — each `Vector{UInt64}` line is the pre-change version
# of the `Memory{UInt64}` line that follows it.
# Encoded data buffer; presumably shared with the viewed sequence — confirm against
# the constructors outside this hunk.
data::Vector{UInt64}
data::Memory{UInt64}
# Range of the viewed portion; presumably symbol indices into `data` — confirm.
part::UnitRange{Int}

# Added to reduce method ambiguities
LongSubSeq{A}(data::Vector{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part)
LongSubSeq{A}(data::Memory{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part)
end

# These unions are significant because LongSubSeq and LongSequence have the same
Expand Down
29 changes: 18 additions & 11 deletions src/longsequences/transformations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,29 @@
### LongSequence specific specializations of src/biosequence/transformations.jl
###

# Replace the backing storage of `seq` with a newly allocated `Memory{UInt64}` of
# `n_chunks` elements and return `seq`.
#
# The leading chunks of the previous storage are carried over: as many as the
# sequence currently occupies (`seq_data_len(seq)`), capped at `n_chunks`. Any
# remaining elements of the new storage are left uninitialized (`undef`).
# `seq.len` is not modified here.
@noinline function resize_memory!(seq::LongSequence, n_chunks::UInt)
    previous = seq.data
    replacement = Memory{UInt64}(undef, n_chunks % Int)
    # Never copy more chunks than the new storage can hold.
    n_keep = min(seq_data_len(seq), n_chunks)
    unsafe_copyto!(replacement, 1, previous, 1, n_keep)
    seq.data = replacement
    return seq
end

# TODO for new breaking version: Do not allow this API, since we can have invalid symbols in encoding?
"""
resize!(seq, size, [force::Bool])

Resize a biological sequence `seq`, to a given `size`. Does not resize the underlying data
array unless the new size does not fit. If `force`, always resize underlying data array.

Throws an `ArgumentError` if `size` is negative. Returns `seq`.
"""
function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A}
# NOTE(review): rendered diff — the `if size < 0 ... return seq` lines immediately
# below are the pre-change body (which resized `seq.data` in place); the lines from
# `size < 0 && throw(...)` onward are the post-change body.
if size < 0
throw(ArgumentError("size must be non-negative"))
else
if force | (seq_data_len(A, size) > seq_data_len(A, length(seq)))
resize!(seq.data, seq_data_len(A, size))
end
seq.len = size
return seq
size < 0 && throw(ArgumentError("size must be non-negative"))
usize = UInt(size)::UInt
# Reallocate only when forced, or when the new size needs more chunks than the
# current length does.
if force || (seq_data_len(A, usize) > seq_data_len(A, length(seq) % UInt))
@noinline resize_memory!(seq, seq_data_len(A, usize) % UInt)
end
# NOTE(review): this assigns the original `size` rather than the already converted
# `usize`; the `UInt` field presumably converts it on assignment — confirm whether
# `usize` was intended.
seq.len = size
return seq
end

"""
Expand Down Expand Up @@ -92,7 +99,7 @@ end

# Reverse chunks in data vector and each symbol within a chunk. Chunks may have nonzero
# offset after use, so use zero_offset!
@inline function reverse_data!(pred, data::Vector{UInt64}, len::UInt, B::BT) where {
@inline function reverse_data!(pred, data::Memory{UInt64}, len::UInt, B::BT) where {
BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}}
@inbounds @simd ivdep for i in 1:len >>> 1
data[i], data[len-i+1] = pred(reversebits(data[len-i+1], B)), pred(reversebits(data[i], B))
Expand All @@ -102,7 +109,7 @@ end
end
end

@inline function reverse_data_copy!(pred, dst::Vector{UInt64}, src::Vector{UInt64}, len::UInt,
@inline function reverse_data_copy!(pred, dst::Memory{UInt64}, src::Memory{UInt64}, len::UInt,
B::BT) where {BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}}
@inbounds @simd for i in eachindex(dst)
dst[i] = pred(reversebits(src[len - i + 1], B))
Expand All @@ -116,7 +123,7 @@ Make a complement sequence of `seq` in place.
"""
function complement!(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet}
seqdata = seq.data
# NOTE(review): rendered diff — the `eachindex(seqdata)` loop header is the
# pre-change line; the `1:seq_data_len(seq)` header below it is its replacement,
# which visits only the chunks the sequence currently occupies rather than the
# whole buffer.
@inbounds for i in eachindex(seqdata)
@inbounds for i in 1:seq_data_len(seq)
# Each chunk is overwritten with the result of `complement_bitpar` for this
# sequence's alphabet.
seqdata[i] = complement_bitpar(seqdata[i], Alphabet(seq))
end
return seq
end
Expand Down
Loading