diff --git a/isovar/__init__.py b/isovar/__init__.py index c034439..0257a83 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.4.23" +__version__ = "1.4.24" from .allele_read import AlleleRead diff --git a/isovar/assembly.py b/isovar/assembly.py index 3373315..ed2fc52 100644 --- a/isovar/assembly.py +++ b/isovar/assembly.py @@ -37,6 +37,62 @@ logger = get_logger(__name__) +DEFAULT_ASSEMBLY_KMER_SIZE = 15 + + +def _sequence_kmers(sequence, k): + return { + sequence[i:i + k] + for i in range(len(sequence) - k + 1) + } + + +def _greedy_merge_candidate_pairs( + variant_sequences, + min_overlap_size=MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE, + max_kmer_size=DEFAULT_ASSEMBLY_KMER_SIZE): + """ + Use an exact k-mer index to skip obviously impossible pairwise comparisons. + + Any truly mergeable pair must share at least one exact k-mer whose length + is no greater than the overlap threshold and no greater than the shortest + eligible sequence. Final merge eligibility is still decided by + VariantSequence.combine. + """ + if len(variant_sequences) < 2: + return [] + + eligible_lengths = [ + len(variant_sequence) + for variant_sequence in variant_sequences + if len(variant_sequence) >= min_overlap_size + ] + if not eligible_lengths: + return [] + + k = max(1, min(max_kmer_size, min_overlap_size, min(eligible_lengths))) + kmer_to_indices = defaultdict(set) + sequence_kmers = [] + + for i, variant_sequence in enumerate(variant_sequences): + if len(variant_sequence) < min_overlap_size: + kmers = () + else: + kmers = tuple(_sequence_kmers(variant_sequence.sequence, k)) + for kmer in kmers: + kmer_to_indices[kmer].add(i) + sequence_kmers.append(kmers) + + candidate_pairs = set() + for i, kmers in enumerate(sequence_kmers): + candidate_indices = set() + for kmer in kmers: + candidate_indices.update(kmer_to_indices[kmer]) + for j in candidate_indices: + if j > i: + candidate_pairs.add((i, j)) + return sorted(candidate_pairs) + def greedy_merge_helper( variant_sequences, @@ -51,16 +107,17 @@ def greedy_merge_helper( were successfully merged. """ candidates = [] - for i in range(len(variant_sequences)): - for j in range(i + 1, len(variant_sequences)): - combined = variant_sequences[i].combine( - variant_sequences[j], min_overlap_size=min_overlap_size) - if combined is not None: - overlap = ( - len(variant_sequences[i]) - + len(variant_sequences[j]) - - len(combined)) - candidates.append((overlap, len(combined.reads), i, j, combined)) + for i, j in _greedy_merge_candidate_pairs( + variant_sequences, + min_overlap_size=min_overlap_size): + combined = variant_sequences[i].combine( + variant_sequences[j], min_overlap_size=min_overlap_size) + if combined is not None: + overlap = ( + len(variant_sequences[i]) + + len(variant_sequences[j]) + - len(combined)) + candidates.append((overlap, len(combined.reads), i, j, combined)) if not candidates: return list(variant_sequences), False diff --git a/tests/test_assembly.py b/tests/test_assembly.py index 71005f7..f8381cb 100644 --- a/tests/test_assembly.py +++ b/tests/test_assembly.py @@ -20,6 +20,7 @@ from isovar.assembly import ( iterative_overlap_assembly, greedy_merge, + greedy_merge_helper, collapse_substrings ) @@ -365,6 +366,36 @@ def test_greedy_merge_deterministic_across_input_orders(): trial, reference_all_reads, result_all_reads) +def test_greedy_merge_helper_skips_impossible_pairs(monkeypatch): + variant_sequences = [ + VariantSequence(prefix="AAAAAA", alt="X", suffix="AAAAAA", reads={"a"}), + VariantSequence(prefix="CCCCCC", alt="X", suffix="CCCCCC", reads={"c"}), + VariantSequence(prefix="GGGGGG", alt="X", suffix="GGGGGG", reads={"g"}), + VariantSequence(prefix="TTTTTT", alt="X", suffix="TTTTTT", reads={"t"}), + ] + + combine_call_count = 0 + original_combine = VariantSequence.combine + + def counting_combine(self, other_sequence, min_overlap_size=1): + nonlocal combine_call_count + combine_call_count += 1 + return original_combine( + self, + other_sequence, + min_overlap_size=min_overlap_size) + + monkeypatch.setattr(VariantSequence, "combine", counting_combine) + + result, merged_any = greedy_merge_helper( + variant_sequences, + min_overlap_size=3) + + eq_(combine_call_count, 0) + eq_(merged_any, False) + eq_(result, variant_sequences) + + def test_greedy_merge_prefers_larger_overlap(): """ When a sequence could merge with two partners, the larger overlap