diff --git a/isovar/__init__.py b/isovar/__init__.py index 2f21c0d..1a1bd83 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.5.0" +__version__ = "1.5.1" from .allele_read import AlleleRead diff --git a/isovar/allele_read.py b/isovar/allele_read.py index 3256397..30ac3fc 100644 --- a/isovar/allele_read.py +++ b/isovar/allele_read.py @@ -28,15 +28,20 @@ class AlleleRead(ValueObject): Extremely simplified representation of a read at a locus: just the allele at the locus and sequence before/after. We're ignoring the base qualities and any additional information about splicing, clipping or alignment. + + When overlapping mates from the same fragment are merged upstream, + `source_read_count` retains the number of raw reads represented by this + fragment-level allele observation. """ - __slots__ = ["prefix", "allele", "suffix", "name", "sequence"] + __slots__ = ["prefix", "allele", "suffix", "name", "sequence", "source_read_count"] - def __init__(self, prefix, allele, suffix, name): + def __init__(self, prefix, allele, suffix, name, source_read_count=1): self.prefix = prefix self.allele = allele self.suffix = suffix self.name = name self.sequence = prefix + allele + suffix + self.source_read_count = source_read_count def __len__(self): return len(self.sequence) @@ -95,4 +100,5 @@ def from_locus_read(cls, locus_read): prefix, nucleotides_at_variant_locus, suffix, - name=read_name) + name=read_name, + source_read_count=locus_read.source_read_count) diff --git a/isovar/dataframe_helpers.py b/isovar/dataframe_helpers.py index 13ff34f..0b95f8a 100644 --- a/isovar/dataframe_helpers.py +++ b/isovar/dataframe_helpers.py @@ -76,7 +76,11 @@ def allele_counts_dataframe(read_evidence_generator): return dataframe_from_generator( element_class=ReadEvidence, variant_and_elements_generator=read_evidence_generator, - # DataFrameBuilder will take the length of these fields' values + converters={ + "ref_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads), + "alt_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads), + "other_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads), + }, rename_dict={ "ref_reads": "num_ref_reads", "alt_reads": "num_alt_reads", @@ -99,6 +103,7 @@ def allele_reads_to_dataframe(variants_and_allele_reads): """ df_builder = DataFrameBuilder( AlleleRead, + exclude={"source_read_count"}, extra_column_fns={ "gene": lambda v, _: ";".join(v.gene_names), }) @@ -116,6 +121,7 @@ def locus_reads_dataframe(alignments, chromosome, base0_start, base0_end, *args, """ df_builder = DataFrameBuilder( LocusRead, + exclude={"source_read_count"}, variant_columns=False, converters={ "reference_positions": list_to_string, @@ -187,7 +193,10 @@ def translations_generator_to_dataframe(translations_generator): }, extra_column_fns={ "untrimmed_variant_sequence_read_count": ( - lambda _, t: len(t.untrimmed_variant_sequence.reads)), + lambda _, t: sum( + getattr(read, "source_read_count", 1) + for read in t.untrimmed_variant_sequence.reads + )), }) @@ -197,7 +206,12 @@ def read_evidence_generator_to_dataframe(read_evidence_generator): """ return dataframe_from_generator( element_class=ReadEvidence, - variant_and_elements_generator=read_evidence_generator) + variant_and_elements_generator=read_evidence_generator, + converters={ + "ref_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads), + "alt_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads), + "other_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads), + }) def isovar_results_to_dataframe(isovar_results): @@ -213,4 +227,4 @@ def isovar_results_to_dataframe(isovar_results): records = [] for isovar_result in isovar_results: records.append(isovar_result.to_record()) - return pd.DataFrame.from_records(records) \ No newline at end of file + return pd.DataFrame.from_records(records) diff --git a/isovar/isovar_result.py b/isovar/isovar_result.py index 348824a..4fcb77e 100644 --- a/isovar/isovar_result.py +++ b/isovar/isovar_result.py @@ -25,6 +25,11 @@ from .common import safediv from .alignment_score import alignment_score + +def _sum_source_read_count(reads): + return sum(getattr(read, "source_read_count", 1) for read in reads) + + class IsovarResult(object): """ This object represents all information gathered about a variant, @@ -983,7 +988,7 @@ def num_ref_reads(self): """ Number of reads which support the reference allele. """ - return len(self.ref_reads) + return _sum_source_read_count(self.ref_reads) @cached_property def num_ref_fragments(self): @@ -997,7 +1002,7 @@ def num_alt_reads(self): """ Number of reads which support the alt allele. """ - return len(self.alt_reads) + return _sum_source_read_count(self.alt_reads) @cached_property def num_alt_fragments(self): @@ -1011,7 +1016,7 @@ def num_other_reads(self): """ Number of reads which support neither the reference nor alt alleles. """ - return len(self.other_reads) + return _sum_source_read_count(self.other_reads) @cached_property def num_other_fragments(self): @@ -1147,4 +1152,4 @@ def num_phased_variants_in_protein_sequence(self): Returns int """ - return len(self.phased_variants_in_protein_sequence) \ No newline at end of file + return len(self.phased_variants_in_protein_sequence) diff --git a/isovar/locus_read.py b/isovar/locus_read.py index e6dc001..e4e9753 100644 --- a/isovar/locus_read.py +++ b/isovar/locus_read.py @@ -23,12 +23,17 @@ class LocusRead(ValueObject): """ Minimal set of information extracted from SAM/BAM alignment file at a particular locus to later figure out the allele at this locus. + + Overlapping paired-end reads from the same fragment may be merged into a + single LocusRead. In that case `source_read_count` records how many raw + alignments were collapsed into this fragment-level view. """ __slots__ = [ "name", "sequence", "reference_positions", "quality_scores", + "source_read_count", "reference_base0_start_inclusive", "reference_base0_end_exclusive", "read_base0_start_inclusive", @@ -44,7 +49,8 @@ def __init__( reference_base0_start_inclusive, reference_base0_end_exclusive, read_base0_start_inclusive, - read_base0_end_exclusive): + read_base0_end_exclusive, + source_read_count=1): """ Parameters ---------- @@ -61,6 +67,11 @@ def __init__( quality_scores : array of int Base qualities for every character in the sequence + source_read_count : int + Number of raw reads represented by this LocusRead. Usually 1, but + overlapping paired-end mates from the same fragment may be merged + into a single LocusRead while keeping this count. + reference_base0_start_inclusive : int Start index of reference locus which is overlapped by this read (base 0, inclusive) @@ -134,9 +145,8 @@ def __init__( self.sequence = sequence self.reference_positions = reference_positions self.quality_scores = quality_scores + self.source_read_count = source_read_count self.reference_base0_start_inclusive = reference_base0_start_inclusive self.reference_base0_end_exclusive = reference_base0_end_exclusive self.read_base0_start_inclusive = read_base0_start_inclusive self.read_base0_end_exclusive = read_base0_end_exclusive - - diff --git a/isovar/main.py b/isovar/main.py index e22648c..c7720b6 100644 --- a/isovar/main.py +++ b/isovar/main.py @@ -147,7 +147,7 @@ def run_isovar( threads=decompression_threads) if read_collector is None: - read_collector = ReadCollector() + read_collector = ReadCollector(merge_overlapping_fragments=True) if protein_sequence_creator is None: protein_sequence_creator = ProteinSequenceCreator() diff --git a/isovar/protein_sequence.py b/isovar/protein_sequence.py index 995f702..1be6fe5 100644 --- a/isovar/protein_sequence.py +++ b/isovar/protein_sequence.py @@ -25,6 +25,10 @@ logger = get_logger(__name__) +def _sum_source_read_count(reads): + return sum(getattr(read, "source_read_count", 1) for read in reads) + + class ProteinSequence(TranslationKey): """ Translated amino acid sequence aggregated across possibly multiple @@ -181,7 +185,7 @@ def num_supporting_reads(self): Returns int """ - return len(self.supporting_reads) + return _sum_source_read_count(self.supporting_reads) @property def num_mismatches_before_variant(self): diff --git a/isovar/read_collector.py b/isovar/read_collector.py index fe210ec..875887f 100644 --- a/isovar/read_collector.py +++ b/isovar/read_collector.py @@ -10,6 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import defaultdict + from .default_parameters import ( USE_SECONDARY_ALIGNMENTS, USE_DUPLICATE_READS, @@ -37,6 +39,7 @@ def __init__( use_duplicate_reads=USE_DUPLICATE_READS, min_mapping_quality=MIN_READ_MAPPING_QUALITY, use_soft_clipped_bases=USE_SOFT_CLIPPED_BASES, + merge_overlapping_fragments=False, ): """ Parameters @@ -52,11 +55,17 @@ def __init__( use_soft_clipped_bases : bool Include soft-clipped positions on a read which were ignored by the aligner + + merge_overlapping_fragments : bool + Merge overlapping paired-end reads from the same fragment into one + fragment-level sequence while preserving the raw alignment count in + `source_read_count`. """ self.use_secondary_alignments = use_secondary_alignments self.use_duplicate_reads = use_duplicate_reads self.min_mapping_quality = min_mapping_quality self.use_soft_clipped_bases = use_soft_clipped_bases + self.merge_overlapping_fragments = merge_overlapping_fragments @staticmethod def _previous_fully_aligned_pair_index(aligned_pairs, start_index): @@ -512,8 +521,217 @@ def locus_read_from_pysam_aligned_segment( reference_base0_end_exclusive=base0_end_exclusive, read_base0_start_inclusive=read_base0_start_inclusive, read_base0_end_exclusive=read_base0_end_exclusive, + source_read_count=1, ) + @staticmethod + def _locus_read_sort_key(locus_read): + mapped_reference_positions = [ + ref_pos for ref_pos in locus_read.reference_positions if ref_pos is not None + ] + if mapped_reference_positions: + return ( + mapped_reference_positions[0], + mapped_reference_positions[-1], + len(locus_read.sequence), + ) + return (float("inf"), float("inf"), len(locus_read.sequence)) + + @staticmethod + def _base_tokens_from_locus_read(locus_read): + sequence = locus_read.sequence + reference_positions = locus_read.reference_positions + quality_scores = list(locus_read.quality_scores) + index_to_key = {} + tokens = [] + + i = 0 + n = len(sequence) + while i < n: + reference_position = reference_positions[i] + if reference_position is not None: + key = (2 * reference_position, 0) + tokens.append((key, reference_position, sequence[i], quality_scores[i])) + index_to_key[i] = key + i += 1 + continue + + j = i + while j < n and reference_positions[j] is None: + j += 1 + + previous_reference_position = None + for k in range(i - 1, -1, -1): + if reference_positions[k] is not None: + previous_reference_position = reference_positions[k] + break + + next_reference_position = None + for k in range(j, n): + if reference_positions[k] is not None: + next_reference_position = reference_positions[k] + break + + if previous_reference_position is None and next_reference_position is None: + return None, None + + if previous_reference_position is None: + anchor = 2 * next_reference_position - 1 + else: + anchor = 2 * previous_reference_position + 1 + + for offset, k in enumerate(range(i, j)): + key = (anchor, offset) + tokens.append((key, None, sequence[k], quality_scores[k])) + index_to_key[k] = key + i = j + + return tokens, index_to_key + + @staticmethod + def _translate_read_interval(locus_read, index_to_key, merged_index_by_key): + start = locus_read.read_base0_start_inclusive + end = locus_read.read_base0_end_exclusive + + if start is None or end is None: + return None + + if start < end: + merged_indices = [ + merged_index_by_key[index_to_key[i]] + for i in range(start, end) + ] + return min(merged_indices), max(merged_indices) + 1 + + if start == 0: + boundary = 0 + elif start == len(locus_read.sequence): + boundary = len(merged_index_by_key) + else: + boundary = merged_index_by_key[index_to_key[start]] + return boundary, boundary + + @classmethod + def _merge_locus_read_pair(cls, first, second): + if ( + first.name != second.name + or first.reference_base0_start_inclusive != second.reference_base0_start_inclusive + or first.reference_base0_end_exclusive != second.reference_base0_end_exclusive + ): + return None + + first_tokens, first_index_to_key = cls._base_tokens_from_locus_read(first) + second_tokens, second_index_to_key = cls._base_tokens_from_locus_read(second) + if first_tokens is None or second_tokens is None: + return None + + overlapping_keys = {key for key, _, _, _ in first_tokens}.intersection( + key for key, _, _, _ in second_tokens + ) + if not overlapping_keys: + return None + + merged_tokens = {} + for token in first_tokens + second_tokens: + key, reference_position, base, quality_score = token + existing = merged_tokens.get(key) + if existing is None: + merged_tokens[key] = token + continue + + _, existing_reference_position, existing_base, existing_quality_score = existing + if existing_reference_position != reference_position: + return None + + if base == existing_base: + merged_tokens[key] = ( + key, + reference_position, + base, + max(existing_quality_score, quality_score), + ) + elif quality_score > existing_quality_score: + merged_tokens[key] = token + elif quality_score == existing_quality_score: + return None + + merged_sequence = [] + merged_reference_positions = [] + merged_quality_scores = [] + merged_index_by_key = {} + + for merged_index, key in enumerate(sorted(merged_tokens)): + _, reference_position, base, quality_score = merged_tokens[key] + merged_sequence.append(base) + merged_reference_positions.append(reference_position) + merged_quality_scores.append(quality_score) + merged_index_by_key[key] = merged_index + + translated_intervals = [ + translated_interval + for translated_interval in [ + cls._translate_read_interval(first, first_index_to_key, merged_index_by_key), + cls._translate_read_interval(second, second_index_to_key, merged_index_by_key), + ] + if translated_interval is not None + ] + if not translated_intervals: + return None + + non_empty_intervals = [ + interval for interval in translated_intervals if interval[0] != interval[1] + ] + if non_empty_intervals: + read_base0_start_inclusive = min(start for start, _ in non_empty_intervals) + read_base0_end_exclusive = max(end for _, end in non_empty_intervals) + else: + read_base0_start_inclusive = read_base0_end_exclusive = translated_intervals[0][0] + + return LocusRead( + name=first.name, + sequence="".join(merged_sequence), + reference_positions=merged_reference_positions, + quality_scores=merged_quality_scores, + reference_base0_start_inclusive=first.reference_base0_start_inclusive, + reference_base0_end_exclusive=first.reference_base0_end_exclusive, + read_base0_start_inclusive=read_base0_start_inclusive, + read_base0_end_exclusive=read_base0_end_exclusive, + source_read_count=first.source_read_count + second.source_read_count, + ) + + @classmethod + def _merge_overlapping_locus_reads(cls, reads): + grouped_reads = defaultdict(list) + for read in reads: + grouped_reads[read.name].append(read) + + merged_reads = [] + for grouped_read_list in grouped_reads.values(): + pending_reads = sorted(grouped_read_list, key=cls._locus_read_sort_key) + changed = True + while changed and len(pending_reads) > 1: + changed = False + next_round = [] + used = [False] * len(pending_reads) + for i, read in enumerate(pending_reads): + if used[i]: + continue + merged_read = read + for j in range(i + 1, len(pending_reads)): + if used[j]: + continue + candidate = cls._merge_locus_read_pair(merged_read, pending_reads[j]) + if candidate is None: + continue + merged_read = candidate + used[j] = True + changed = True + used[i] = True + next_round.append(merged_read) + pending_reads = sorted(next_round, key=cls._locus_read_sort_key) + merged_reads.extend(pending_reads) + return merged_reads + def get_locus_reads( self, alignment_file, @@ -530,6 +748,12 @@ def get_locus_reads( those positions matches a variant happens later when LocusRead objects are converted to AlleleRead objects. + If `merge_overlapping_fragments` is enabled then overlapping paired-end + reads from the same fragment are conservatively merged so downstream + assembly sees one fragment-spanning sequence instead of double-counting + the overlap. The raw number of source alignments is retained on each + merged LocusRead via `source_read_count`. + Parameters ---------- alignment_file : pysam.AlignmentFile @@ -589,7 +813,15 @@ def get_locus_reads( base0_start_inclusive, base0_end_exclusive, ) - return reads + if not self.merge_overlapping_fragments: + return reads + + merged_reads = self._merge_overlapping_locus_reads(reads) + logger.info( + "Merged overlapping paired reads into %d locus reads", + len(merged_reads), + ) + return merged_reads @staticmethod def _infer_chromosome_name(variant_chromosome_name, valid_chromosome_names): diff --git a/isovar/varcode_adapter.py b/isovar/varcode_adapter.py index 306e68f..9734fa5 100644 --- a/isovar/varcode_adapter.py +++ b/isovar/varcode_adapter.py @@ -128,7 +128,10 @@ def mutant_transcript(self, variant, transcript): result, translation, matched_transcript = entry transcript_edit = self._transcript_edit(result, matched_transcript) evidence = { - "num_supporting_reads": len(translation.reads), + "num_supporting_reads": sum( + getattr(read, "source_read_count", 1) + for read in translation.reads + ), "num_supporting_fragments": len({read.name for read in translation.reads}), "num_cdna_mismatches_before_variant": ( translation.num_mismatches_before_variant), diff --git a/tests/test_allele_counts.py b/tests/test_allele_counts.py index 507efc9..faceba7 100644 --- a/tests/test_allele_counts.py +++ b/tests/test_allele_counts.py @@ -13,7 +13,10 @@ from isovar.dataframe_helpers import allele_counts_dataframe from isovar.allele_read import AlleleRead from isovar.read_evidence import ReadEvidence +from isovar.read_collector import ReadCollector from varcode import Variant + +from .mock_objects import MockAlignmentFile, make_pysam_read from .common import eq_ @@ -47,5 +50,43 @@ def test_allele_count_dataframe(): eq_(row.num_other_fragments, 2) +def test_allele_count_dataframe_preserves_raw_read_count_after_fragment_merge(): + """ + Overlapping mates from one fragment should be merged for assembly while the + read-count columns still report both raw reads. + + Regression test for GitHub issue #59. + """ + variant = Variant("1", 4, "T", "G", normalize_contig_names=False) + left_mate = make_pysam_read( + seq="ACCGTG", + cigar="6M", + name="fragment-1", + reference_start=0, + ) + right_mate = make_pysam_read( + seq="CGTGAA", + cigar="6M", + name="fragment-1", + reference_start=2, + ) + read_evidence = ReadCollector( + merge_overlapping_fragments=True + ).read_evidence_for_variant( + variant=variant, + alignment_file=MockAlignmentFile( + references=("1",), + reads=[left_mate, right_mate], + ), + ) + eq_(len(read_evidence.alt_reads), 1) + eq_(read_evidence.alt_reads[0].sequence, "ACCGTGAA") + + df = allele_counts_dataframe([(variant, read_evidence)]) + row = df.iloc[0] + eq_(row.num_alt_reads, 2) + eq_(row.num_alt_fragments, 1) + + if __name__ == "__main__": test_allele_count_dataframe() diff --git a/tests/test_locus_reads.py b/tests/test_locus_reads.py index d0f8ea4..7268b45 100644 --- a/tests/test_locus_reads.py +++ b/tests/test_locus_reads.py @@ -242,6 +242,15 @@ def test_locus_reads_dataframe(): n_reads_expected -= 1 print("Found %d sequences in %s" % (n_reads_expected, sam_path_single_variant)) + read_collector = ReadCollector(merge_overlapping_fragments=True) + reads = read_collector.get_locus_reads( + sam_all_variants, + "chr4", + 45802538, + 45802539, + ) + eq_(sum(read.source_read_count for read in reads), n_reads_expected) + df = locus_reads_dataframe( alignments=sam_all_variants, chromosome="chr4", @@ -251,6 +260,41 @@ def test_locus_reads_dataframe(): eq_(len(df), n_reads_expected) +def test_get_locus_reads_merges_overlapping_paired_reads(): + """ + Overlapping paired-end reads from the same fragment should collapse into a + single longer LocusRead while retaining the raw source read count. + + Regression test for GitHub issue #59. + """ + left_mate = make_pysam_read( + seq="ACCGTG", + cigar="6M", + name="fragment-1", + reference_start=0, + ) + right_mate = make_pysam_read( + seq="CGTGAA", + cigar="6M", + name="fragment-1", + reference_start=2, + ) + read_collector = ReadCollector(merge_overlapping_fragments=True) + reads = read_collector.get_locus_reads( + MockAlignmentFile(references=("chromosome",), reads=[left_mate, right_mate]), + "chromosome", + base0_start_inclusive=3, + base0_end_exclusive=4, + ) + eq_(len(reads), 1) + merged_read = reads[0] + eq_(merged_read.sequence, "ACCGTGAA") + eq_(merged_read.reference_positions, list(range(8))) + eq_(merged_read.read_base0_start_inclusive, 3) + eq_(merged_read.read_base0_end_exclusive, 4) + eq_(merged_read.source_read_count, 2) + + def test_locus_reads_clamps_fetch_start_at_contig_boundary(): """ Querying a locus at the first aligned base of a contig should not widen the