Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.4.23"
__version__ = "1.4.24"


from .allele_read import AlleleRead
Expand Down
77 changes: 67 additions & 10 deletions isovar/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,62 @@

logger = get_logger(__name__)

DEFAULT_ASSEMBLY_KMER_SIZE = 15


def _sequence_kmers(sequence, k):
return {
sequence[i:i + k]
for i in range(len(sequence) - k + 1)
}


def _greedy_merge_candidate_pairs(
variant_sequences,
min_overlap_size=MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE,
max_kmer_size=DEFAULT_ASSEMBLY_KMER_SIZE):
"""
Use an exact k-mer index to skip obviously impossible pairwise comparisons.

Any truly mergeable pair must share at least one exact k-mer whose length
is no greater than the overlap threshold and no greater than the shortest
eligible sequence. Final merge eligibility is still decided by
VariantSequence.combine.
"""
if len(variant_sequences) < 2:
return []

eligible_lengths = [
len(variant_sequence)
for variant_sequence in variant_sequences
if len(variant_sequence) >= min_overlap_size
]
if not eligible_lengths:
return []

k = max(1, min(max_kmer_size, min_overlap_size, min(eligible_lengths)))
kmer_to_indices = defaultdict(set)
sequence_kmers = []

for i, variant_sequence in enumerate(variant_sequences):
if len(variant_sequence) < min_overlap_size:
kmers = ()
else:
kmers = tuple(_sequence_kmers(variant_sequence.sequence, k))
for kmer in kmers:
kmer_to_indices[kmer].add(i)
sequence_kmers.append(kmers)

candidate_pairs = set()
for i, kmers in enumerate(sequence_kmers):
candidate_indices = set()
for kmer in kmers:
candidate_indices.update(kmer_to_indices[kmer])
for j in candidate_indices:
if j > i:
candidate_pairs.add((i, j))
return sorted(candidate_pairs)


def greedy_merge_helper(
variant_sequences,
Expand All @@ -51,16 +107,17 @@ def greedy_merge_helper(
were successfully merged.
"""
candidates = []
for i in range(len(variant_sequences)):
for j in range(i + 1, len(variant_sequences)):
combined = variant_sequences[i].combine(
variant_sequences[j], min_overlap_size=min_overlap_size)
if combined is not None:
overlap = (
len(variant_sequences[i])
+ len(variant_sequences[j])
- len(combined))
candidates.append((overlap, len(combined.reads), i, j, combined))
for i, j in _greedy_merge_candidate_pairs(
variant_sequences,
min_overlap_size=min_overlap_size):
combined = variant_sequences[i].combine(
variant_sequences[j], min_overlap_size=min_overlap_size)
if combined is not None:
overlap = (
len(variant_sequences[i])
+ len(variant_sequences[j])
- len(combined))
candidates.append((overlap, len(combined.reads), i, j, combined))

if not candidates:
return list(variant_sequences), False
Expand Down
31 changes: 31 additions & 0 deletions tests/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from isovar.assembly import (
iterative_overlap_assembly,
greedy_merge,
greedy_merge_helper,
collapse_substrings
)

Expand Down Expand Up @@ -365,6 +366,36 @@ def test_greedy_merge_deterministic_across_input_orders():
trial, reference_all_reads, result_all_reads)


def test_greedy_merge_helper_skips_impossible_pairs(monkeypatch):
variant_sequences = [
VariantSequence(prefix="AAAAAA", alt="X", suffix="AAAAAA", reads={"a"}),
VariantSequence(prefix="CCCCCC", alt="X", suffix="CCCCCC", reads={"c"}),
VariantSequence(prefix="GGGGGG", alt="X", suffix="GGGGGG", reads={"g"}),
VariantSequence(prefix="TTTTTT", alt="X", suffix="TTTTTT", reads={"t"}),
]

combine_call_count = 0
original_combine = VariantSequence.combine

def counting_combine(self, other_sequence, min_overlap_size=1):
nonlocal combine_call_count
combine_call_count += 1
return original_combine(
self,
other_sequence,
min_overlap_size=min_overlap_size)

monkeypatch.setattr(VariantSequence, "combine", counting_combine)

result, merged_any = greedy_merge_helper(
variant_sequences,
min_overlap_size=3)

eq_(combine_call_count, 0)
eq_(merged_any, False)
eq_(result, variant_sequences)


def test_greedy_merge_prefers_larger_overlap():
"""
When a sequence could merge with two partners, the larger overlap
Expand Down
Loading