diff --git a/isovar/__init__.py b/isovar/__init__.py index d89ef68..a920811 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.5.3" +__version__ = "1.5.4" from .allele_read import AlleleRead @@ -23,6 +23,7 @@ from .protein_sequence_creator import ProteinSequenceCreator from .read_collector import ReadCollector from .read_evidence import ReadEvidence +from .transcript_assembly_edit import TranscriptAssemblyEdit from .variant_orf import VariantORF from .variant_sequence import VariantSequence from .variant_sequence_creator import VariantSequenceCreator @@ -40,6 +41,7 @@ "ProteinSequenceCreator", "ReadCollector", "ReadEvidence", + "TranscriptAssemblyEdit", "VariantORF", "VariantSequence", "VariantSequenceCreator", diff --git a/isovar/phase_group.py b/isovar/phase_group.py index 10317a1..6f75d51 100644 --- a/isovar/phase_group.py +++ b/isovar/phase_group.py @@ -30,9 +30,9 @@ class PhaseGroup(ValueObject): public model can grow into that use case without changing shape again. Some phase groups are backed by translated Isovar assemblies. In those - cases the group can also carry directly observed cDNA, protein, and - transcript metadata from the supporting assemblies. Read-only phasing groups - leave those fields empty. + cases the group can also carry directly observed cDNA, protein, transcript, + and transcript-edit metadata from the supporting assemblies. Read-only + phasing groups leave those fields empty. """ __slots__ = [ @@ -43,6 +43,9 @@ class PhaseGroup(ValueObject): "mutant_protein_sequences", "transcript_ids", "transcript_names", + "known_somatic_transcript_edits", + "known_germline_transcript_edits", + "unexplained_transcript_edits", ] def __init__( @@ -53,7 +56,10 @@ def __init__( cdna_sequences=(), mutant_protein_sequences=(), transcript_ids=(), - transcript_names=()): + transcript_names=(), + known_somatic_transcript_edits=(), + known_germline_transcript_edits=(), + unexplained_transcript_edits=()): self.somatic_variants = tuple(somatic_variants) self.germline_variants = tuple(germline_variants) self.supporting_read_names = frozenset(supporting_read_names) @@ -61,3 +67,8 @@ def __init__( self.mutant_protein_sequences = tuple(mutant_protein_sequences) self.transcript_ids = tuple(transcript_ids) self.transcript_names = tuple(transcript_names) + self.known_somatic_transcript_edits = tuple( + known_somatic_transcript_edits) + self.known_germline_transcript_edits = tuple( + known_germline_transcript_edits) + self.unexplained_transcript_edits = tuple(unexplained_transcript_edits) diff --git a/isovar/phasing.py b/isovar/phasing.py index 0243f6c..49f148c 100644 --- a/isovar/phasing.py +++ b/isovar/phasing.py @@ -14,6 +14,7 @@ from .default_parameters import MIN_SHARED_FRAGMENTS_FOR_PHASING from .phase_group import PhaseGroup +from .transcript_edit_helpers import transcript_assembly_edit_sort_key def _variant_sort_key(variant): @@ -205,21 +206,54 @@ def create_phase_groups( mutant_protein_sequences = () transcript_ids = () transcript_names = () + known_somatic_transcript_edits = () + known_germline_transcript_edits = () + unexplained_transcript_edits = () else: cdna_sequences = set() mutant_protein_sequences = set() transcript_ids = set() transcript_names = set() + known_somatic_transcript_edits = set() + known_germline_transcript_edits = set() + unexplained_transcript_edits = set() for grouped_variant in component: protein_sequence = variant_to_top_protein_sequence_dict.get( grouped_variant ) if protein_sequence is None: continue + if hasattr(protein_sequence, "_transcript_assembly_edits_by_category"): + categorized_edits = ( + protein_sequence._transcript_assembly_edits_by_category()) + else: + categorized_edits = { + "known_somatic": getattr( + protein_sequence, + "known_somatic_transcript_edits", + (), + ), + "known_germline": getattr( + protein_sequence, + "known_germline_transcript_edits", + (), + ), + "unexplained": getattr( + protein_sequence, + "unexplained_transcript_edits", + (), + ), + } cdna_sequences.update(protein_sequence.cdna_sequences) mutant_protein_sequences.add(protein_sequence.amino_acids) transcript_ids.update(protein_sequence.transcript_ids) transcript_names.update(protein_sequence.transcript_names) + known_somatic_transcript_edits.update( + categorized_edits["known_somatic"]) + known_germline_transcript_edits.update( + categorized_edits["known_germline"]) + unexplained_transcript_edits.update( + categorized_edits["unexplained"]) phase_group = PhaseGroup( somatic_variants=tuple(sorted(component, key=_variant_sort_key)), @@ -229,6 +263,18 @@ def create_phase_groups( mutant_protein_sequences=tuple(sorted(mutant_protein_sequences)), transcript_ids=tuple(sorted(transcript_ids)), transcript_names=tuple(sorted(transcript_names)), + known_somatic_transcript_edits=tuple(sorted( + known_somatic_transcript_edits, + key=transcript_assembly_edit_sort_key, + )), + known_germline_transcript_edits=tuple(sorted( + known_germline_transcript_edits, + key=transcript_assembly_edit_sort_key, + )), + unexplained_transcript_edits=tuple(sorted( + unexplained_transcript_edits, + key=transcript_assembly_edit_sort_key, + )), ) for grouped_variant in component: variant_to_phase_group[grouped_variant] = phase_group diff --git a/isovar/protein_sequence.py b/isovar/protein_sequence.py index 1be6fe5..76051c1 100644 --- a/isovar/protein_sequence.py +++ b/isovar/protein_sequence.py @@ -17,6 +17,10 @@ """ from .common import normalize_base0_range_indices +from .transcript_edit_helpers import ( + categorize_transcript_assembly_edits_from_translation, + transcript_assembly_edit_sort_key, +) from .translation_key import TranslationKey from .translation import Translation # noqa: F401 from .logging import get_logger @@ -258,6 +262,54 @@ def transcript_ids(self): for transcript in self.transcripts ] + def _transcript_assembly_edits_by_category(self): + categorized_edits = { + "known_somatic": set(), + "known_germline": set(), + "unexplained": set(), + } + for translation in self.translations: + for transcript in translation.reference_context.transcripts: + translation_edits = ( + categorize_transcript_assembly_edits_from_translation( + translation, + transcript, + ) + ) + for category, edits in translation_edits.items(): + categorized_edits[category].update(edits) + return { + category: tuple(sorted( + edits, + key=transcript_assembly_edit_sort_key, + )) + for category, edits in categorized_edits.items() + } + + @property + def known_somatic_transcript_edits(self): + """ + Transcript-relative edits from known somatic variants observed in the + supporting assemblies for this protein sequence. + """ + return self._transcript_assembly_edits_by_category()["known_somatic"] + + @property + def known_germline_transcript_edits(self): + """ + Transcript-relative edits from known germline variants observed in the + supporting assemblies for this protein sequence. + """ + return self._transcript_assembly_edits_by_category()["known_germline"] + + @property + def unexplained_transcript_edits(self): + """ + Transcript-relative edits observed in the supporting assemblies which + are not yet matched to a known input variant. + """ + return self._transcript_assembly_edits_by_category()["unexplained"] + @property def genes(self): """ diff --git a/isovar/transcript_assembly_edit.py b/isovar/transcript_assembly_edit.py new file mode 100644 index 0000000..571fac3 --- /dev/null +++ b/isovar/transcript_assembly_edit.py @@ -0,0 +1,49 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Transcript-anchored edit derived from a local Isovar assembly. +""" + +from .value_object import ValueObject + + +class TranscriptAssemblyEdit(ValueObject): + """ + Wrap a TranscriptEdit with the transcript it is relative to. + + TranscriptEdit coordinates only make sense for a particular transcript + sequence, so PhaseGroup stores this transcript-anchored form rather than + bare edits. + """ + + __slots__ = [ + "transcript_id", + "transcript_name", + "edit", + ] + + @property + def cdna_start(self): + return self.edit.cdna_start + + @property + def cdna_end(self): + return self.edit.cdna_end + + @property + def alt_bases(self): + return self.edit.alt_bases + + @property + def source_variant(self): + return self.edit.source_variant diff --git a/isovar/transcript_edit_helpers.py b/isovar/transcript_edit_helpers.py new file mode 100644 index 0000000..049f03c --- /dev/null +++ b/isovar/transcript_edit_helpers.py @@ -0,0 +1,220 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Helpers for extracting transcript-relative edits from local Isovar assemblies. +""" + +from difflib import SequenceMatcher + +from varcode.mutant_transcript import TranscriptEdit + +from .dna import reverse_complement_dna +from .transcript_assembly_edit import TranscriptAssemblyEdit +from .variant_helpers import ( + interbase_range_affected_by_variant_on_transcript, + trim_variant_fields, +) + + +class TrimmedVariant(object): + def __init__(self, start, ref, alt): + self.start = start + self.ref = ref + self.alt = alt + + @property + def is_insertion(self): + return len(self.ref) == 0 and len(self.alt) > 0 + + +def trimmed_variant_from_variant(variant): + start, ref, alt = trim_variant_fields(variant.start, variant.ref, variant.alt) + return TrimmedVariant(start=start, ref=ref, alt=alt) + + +def transcript_edit_from_variant(variant, transcript): + trimmed_variant = trimmed_variant_from_variant(variant) + cdna_start, cdna_end = interbase_range_affected_by_variant_on_transcript( + trimmed_variant, + transcript, + ) + + alt_bases = trimmed_variant.alt + if getattr(transcript, "strand", None) == "-": + alt_bases = reverse_complement_dna(alt_bases) + + return TranscriptEdit( + cdna_start=cdna_start, + cdna_end=cdna_end, + alt_bases=alt_bases, + source_variant=variant, + ) + + +def transcript_edits_from_sequence_diff( + reference_sequence, + observed_sequence, + start_offset, + ignore_terminal_deletions=False, + ignore_terminal_insertions=False): + """ + Convert a local sequence difference into TranscriptEdit objects. + + This is intentionally conservative about trailing differences since Isovar + assemblies may stop before the reference context does. + """ + edits = [] + + if len(reference_sequence) == len(observed_sequence): + mismatch_start = None + for idx, (ref_base, observed_base) in enumerate(zip( + reference_sequence, observed_sequence)): + if ref_base != observed_base: + if mismatch_start is None: + mismatch_start = idx + elif mismatch_start is not None: + edits.append( + TranscriptEdit( + cdna_start=start_offset + mismatch_start, + cdna_end=start_offset + idx, + alt_bases=observed_sequence[mismatch_start:idx], + source_variant=None, + ) + ) + mismatch_start = None + + if mismatch_start is not None: + edits.append( + TranscriptEdit( + cdna_start=start_offset + mismatch_start, + cdna_end=start_offset + len(reference_sequence), + alt_bases=observed_sequence[mismatch_start:], + source_variant=None, + ) + ) + return tuple(edits) + + matcher = SequenceMatcher( + a=reference_sequence, + b=observed_sequence, + autojunk=False, + ) + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "equal": + continue + + at_reference_end = i2 == len(reference_sequence) + at_observed_end = j2 == len(observed_sequence) + + if ( + tag == "delete" and + ignore_terminal_deletions and + at_reference_end and + at_observed_end): + continue + + if ( + tag == "insert" and + ignore_terminal_insertions and + at_reference_end and + at_observed_end): + continue + + edits.append( + TranscriptEdit( + cdna_start=start_offset + i1, + cdna_end=start_offset + i2, + alt_bases=observed_sequence[j1:j2], + source_variant=None, + ) + ) + return tuple(edits) + + +def unexplained_transcript_edits_from_translation(translation, transcript): + """ + Diff the matched local cDNA against the reference transcript window. + """ + focal_edit = transcript_edit_from_variant( + translation.reference_context.variant, + transcript, + ) + + reference_prefix = translation.reference_cdna_sequence_before_variant + observed_prefix = translation.cdna_sequence[:translation.variant_cdna_interval_start] + prefix_start = focal_edit.cdna_start - len(reference_prefix) + prefix_edits = transcript_edits_from_sequence_diff( + reference_sequence=reference_prefix, + observed_sequence=observed_prefix, + start_offset=prefix_start, + ) + + reference_suffix = translation.variant_orf.reference_cdna_sequence_after_variant + observed_suffix = translation.cdna_sequence[translation.variant_cdna_interval_end:] + suffix_edits = transcript_edits_from_sequence_diff( + reference_sequence=reference_suffix, + observed_sequence=observed_suffix, + start_offset=focal_edit.cdna_end, + ignore_terminal_deletions=True, + ignore_terminal_insertions=True, + ) + return prefix_edits + suffix_edits + + +def _transcript_assembly_edit(transcript, edit): + return TranscriptAssemblyEdit( + transcript_id=getattr(transcript, "id", ""), + transcript_name=getattr(transcript, "name", ""), + edit=edit, + ) + + +def _source_variant_sort_key(variant): + if variant is None: + return ("", -1, "", "") + return ( + getattr(variant, "contig", ""), + getattr(variant, "start", -1), + getattr(variant, "ref", ""), + getattr(variant, "alt", ""), + ) + + +def transcript_assembly_edit_sort_key(transcript_assembly_edit): + return ( + transcript_assembly_edit.transcript_id, + transcript_assembly_edit.transcript_name, + transcript_assembly_edit.cdna_start, + transcript_assembly_edit.cdna_end, + transcript_assembly_edit.alt_bases, + _source_variant_sort_key(transcript_assembly_edit.source_variant), + ) + + +def categorize_transcript_assembly_edits_from_translation(translation, transcript): + known_somatic_edit = _transcript_assembly_edit( + transcript, + transcript_edit_from_variant(translation.reference_context.variant, transcript), + ) + unexplained_edits = tuple( + _transcript_assembly_edit(transcript, edit) + for edit in unexplained_transcript_edits_from_translation( + translation, + transcript, + ) + ) + return { + "known_somatic": (known_somatic_edit,), + "known_germline": (), + "unexplained": unexplained_edits, + } diff --git a/isovar/varcode_adapter.py b/isovar/varcode_adapter.py index 9734fa5..9e93eba 100644 --- a/isovar/varcode_adapter.py +++ b/isovar/varcode_adapter.py @@ -15,19 +15,15 @@ This is intentionally conservative: it exposes transcript-keyed assembled contigs from existing Translation objects and surfaces already-known phased RNA -variants on those contigs. It does not yet discover germline SNPs or unexplained -contig edits directly from the assembled cDNA sequence. +variants on those contigs. It surfaces transcript-relative edits observed in +the assembled cDNA, but it does not yet discover germline SNPs as separate +known variants. """ -from collections import namedtuple - -from .dna import reverse_complement_dna -from .variant_helpers import interbase_range_affected_by_variant_on_transcript - - -TrimmedVariant = namedtuple( - "TrimmedVariant", - ["start", "ref", "alt", "is_insertion"]) +from .transcript_edit_helpers import ( + categorize_transcript_assembly_edits_from_translation, + transcript_assembly_edit_sort_key, +) class VarcodeAdapter(object): @@ -71,37 +67,6 @@ def _entry(self, variant, transcript): self._transcript_key(transcript), )) - @staticmethod - def _trimmed_variant_from_result(result): - ref = result.read_evidence.trimmed_ref - alt = result.read_evidence.trimmed_alt - return TrimmedVariant( - start=result.read_evidence.trimmed_base1_start, - ref=ref, - alt=alt, - is_insertion=len(ref) == 0 and len(alt) > 0) - - def _transcript_edit(self, result, transcript): - from varcode.mutant_transcript import TranscriptEdit - - trimmed_variant = self._trimmed_variant_from_result(result) - try: - cdna_start, cdna_end = interbase_range_affected_by_variant_on_transcript( - trimmed_variant, - transcript) - except ValueError: - return None - - alt_bases = trimmed_variant.alt - if getattr(transcript, "strand", None) == "-": - alt_bases = reverse_complement_dna(alt_bases) - - return TranscriptEdit( - cdna_start=cdna_start, - cdna_end=cdna_end, - alt_bases=alt_bases, - source_variant=result.variant) - def has_contig(self, variant, transcript): return self._entry(variant, transcript) is not None @@ -125,8 +90,15 @@ def mutant_transcript(self, variant, transcript): from varcode.mutant_transcript import MutantTranscript - result, translation, matched_transcript = entry - transcript_edit = self._transcript_edit(result, matched_transcript) + _, translation, matched_transcript = entry + categorized_edits = categorize_transcript_assembly_edits_from_translation( + translation, + matched_transcript, + ) + transcript_assembly_edits = [] + for category in ("known_somatic", "known_germline", "unexplained"): + transcript_assembly_edits.extend(categorized_edits[category]) + transcript_assembly_edits.sort(key=transcript_assembly_edit_sort_key) evidence = { "num_supporting_reads": sum( getattr(read, "source_read_count", 1) @@ -138,7 +110,10 @@ def mutant_transcript(self, variant, transcript): "num_cdna_mismatches_after_variant": ( translation.num_mismatches_after_variant), } - edits = (transcript_edit,) if transcript_edit is not None else () + edits = tuple( + transcript_assembly_edit.edit + for transcript_assembly_edit in transcript_assembly_edits + ) return MutantTranscript( reference_transcript=matched_transcript, edits=edits, diff --git a/tests/test_phasing.py b/tests/test_phasing.py index d18071f..84ffdda 100644 --- a/tests/test_phasing.py +++ b/tests/test_phasing.py @@ -11,11 +11,13 @@ # limitations under the License. from varcode import Variant +from varcode.mutant_transcript import TranscriptEdit from isovar.allele_read import AlleleRead from isovar.isovar_result import IsovarResult from isovar.phasing import annotate_phased_variants from isovar.read_evidence import ReadEvidence +from isovar.transcript_assembly_edit import TranscriptAssemblyEdit from .common import eq_ @@ -27,12 +29,38 @@ def __init__( amino_acids, cdna_sequences, transcript_ids, - transcript_names): + transcript_names, + known_somatic_transcript_edits=(), + known_germline_transcript_edits=(), + unexplained_transcript_edits=()): self.read_names_supporting_protein_sequence = set(read_names) self.amino_acids = amino_acids self.cdna_sequences = set(cdna_sequences) self.transcript_ids = list(transcript_ids) self.transcript_names = list(transcript_names) + self.known_somatic_transcript_edits = tuple(known_somatic_transcript_edits) + self.known_germline_transcript_edits = tuple( + known_germline_transcript_edits) + self.unexplained_transcript_edits = tuple(unexplained_transcript_edits) + + +def make_transcript_assembly_edit( + transcript_id, + transcript_name, + cdna_start, + cdna_end, + alt_bases, + source_variant=None): + return TranscriptAssemblyEdit( + transcript_id=transcript_id, + transcript_name=transcript_name, + edit=TranscriptEdit( + cdna_start=cdna_start, + cdna_end=cdna_end, + alt_bases=alt_bases, + source_variant=source_variant, + ), + ) def make_isovar_result( @@ -42,7 +70,10 @@ def make_isovar_result( amino_acids=None, cdna_sequences=(), transcript_ids=(), - transcript_names=()): + transcript_names=(), + known_somatic_transcript_edits=(), + known_germline_transcript_edits=(), + unexplained_transcript_edits=()): read_evidence = ReadEvidence( trimmed_base1_start=variant.start, trimmed_ref=variant.ref, @@ -64,6 +95,9 @@ def make_isovar_result( cdna_sequences=cdna_sequences, transcript_ids=transcript_ids, transcript_names=transcript_names, + known_somatic_transcript_edits=known_somatic_transcript_edits, + known_germline_transcript_edits=known_germline_transcript_edits, + unexplained_transcript_edits=unexplained_transcript_edits, )], ) @@ -72,6 +106,16 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): v1 = Variant("1", 10, "A", "C", normalize_contig_names=False) v2 = Variant("1", 11, "G", "T", normalize_contig_names=False) v3 = Variant("1", 12, "T", "G", normalize_contig_names=False) + germline = Variant("1", 9, "A", "G", normalize_contig_names=False) + + somatic_edit_v1 = make_transcript_assembly_edit("tx-1", "TX1", 10, 11, "C", v1) + somatic_edit_v2 = make_transcript_assembly_edit("tx-2", "TX2", 20, 21, "T", v2) + somatic_edit_v3 = make_transcript_assembly_edit("tx-2", "TX2", 30, 31, "G", v3) + germline_edit = make_transcript_assembly_edit( + "tx-shared", "TX_SHARED", 7, 8, "G", germline) + unexplained_edit_v1 = make_transcript_assembly_edit("tx-1", "TX1", 4, 5, "G") + unexplained_edit_v2 = make_transcript_assembly_edit( + "tx-shared", "TX_SHARED", 24, 24, "TT") results = annotate_phased_variants( [ @@ -83,6 +127,9 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): cdna_sequences={"cdna-1"}, transcript_ids={"tx-1", "tx-shared"}, transcript_names={"TX1", "TX_SHARED"}, + known_somatic_transcript_edits=(somatic_edit_v1,), + known_germline_transcript_edits=(germline_edit,), + unexplained_transcript_edits=(unexplained_edit_v1,), ), make_isovar_result( v2, @@ -92,6 +139,9 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): cdna_sequences={"cdna-2"}, transcript_ids={"tx-2", "tx-shared"}, transcript_names={"TX2", "TX_SHARED"}, + known_somatic_transcript_edits=(somatic_edit_v2,), + known_germline_transcript_edits=(germline_edit,), + unexplained_transcript_edits=(unexplained_edit_v2,), ), make_isovar_result( v3, @@ -101,6 +151,7 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): cdna_sequences={"cdna-3"}, transcript_ids={"tx-2"}, transcript_names={"TX2"}, + known_somatic_transcript_edits=(somatic_edit_v3,), ), ], min_shared_fragments_for_phasing=1, @@ -114,6 +165,16 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): expected_protein_sequences = ("AA-1", "AA-2", "AA-3") expected_transcript_ids = ("tx-1", "tx-2", "tx-shared") expected_transcript_names = ("TX1", "TX2", "TX_SHARED") + expected_known_somatic_edits = ( + somatic_edit_v1, + somatic_edit_v2, + somatic_edit_v3, + ) + expected_known_germline_edits = (germline_edit,) + expected_unexplained_edits = ( + unexplained_edit_v1, + unexplained_edit_v2, + ) expected_supporting_neighbors = { v1: {v2}, v2: {v1, v3}, @@ -134,6 +195,9 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): eq_(supporting_group.mutant_protein_sequences, ()) eq_(supporting_group.transcript_ids, ()) eq_(supporting_group.transcript_names, ()) + eq_(supporting_group.known_somatic_transcript_edits, ()) + eq_(supporting_group.known_germline_transcript_edits, ()) + eq_(supporting_group.unexplained_transcript_edits, ()) eq_(protein_group.somatic_variants, expected_group_variants) eq_(protein_group.germline_variants, ()) @@ -142,6 +206,18 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): eq_(protein_group.mutant_protein_sequences, expected_protein_sequences) eq_(protein_group.transcript_ids, expected_transcript_ids) eq_(protein_group.transcript_names, expected_transcript_names) + eq_( + protein_group.known_somatic_transcript_edits, + expected_known_somatic_edits, + ) + eq_( + protein_group.known_germline_transcript_edits, + expected_known_germline_edits, + ) + eq_( + protein_group.unexplained_transcript_edits, + expected_unexplained_edits, + ) eq_( result.phased_variants_in_supporting_reads, diff --git a/tests/test_transcript_edits.py b/tests/test_transcript_edits.py new file mode 100644 index 0000000..024533f --- /dev/null +++ b/tests/test_transcript_edits.py @@ -0,0 +1,205 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from types import SimpleNamespace + +from varcode import Variant + +from isovar import ProteinSequence, VarcodeAdapter +from isovar.allele_read import AlleleRead +from isovar.isovar_result import IsovarResult +from isovar.read_evidence import ReadEvidence +from isovar.translation import Translation +from isovar.transcript_assembly_edit import TranscriptAssemblyEdit +from isovar.transcript_edit_helpers import ( + categorize_transcript_assembly_edits_from_translation, +) +from isovar.variant_orf import VariantORF + +from .common import eq_ + + +@dataclass(frozen=True, order=True) +class DummyTranscript(object): + id: str + name: str + strand: str = "+" + genomic_offset: int = 100 + + def spliced_offset(self, dna_pos): + return dna_pos - self.genomic_offset + + +def make_translation( + variant, + transcript, + reference_prefix="TTAA", + observed_prefix="TGAA", + observed_alt="G", + reference_suffix="CCGG", + observed_suffix="CCAG"): + cdna_sequence = observed_prefix + observed_alt + observed_suffix + variant_orf = VariantORF( + cdna_sequence=cdna_sequence, + offset_to_first_complete_codon=0, + variant_cdna_interval_start=len(observed_prefix), + variant_cdna_interval_end=len(observed_prefix) + len(observed_alt), + reference_cdna_sequence_before_variant=reference_prefix, + reference_cdna_sequence_after_variant=reference_suffix, + num_mismatches_before_variant=1, + num_mismatches_after_variant=1, + ) + reference_context = SimpleNamespace( + variant=variant, + transcripts=(transcript,), + ) + return Translation( + amino_acids="M", + contains_mutation=True, + mutation_start_idx=0, + mutation_end_idx=1, + ends_with_stop_codon=False, + frameshift=False, + untrimmed_variant_sequence=SimpleNamespace(reads=[]), + reference_context=reference_context, + variant_orf=variant_orf, + ) + + +def make_expected_edit( + transcript, + cdna_start, + cdna_end, + alt_bases, + source_variant=None): + from varcode.mutant_transcript import TranscriptEdit + + return TranscriptAssemblyEdit( + transcript_id=transcript.id, + transcript_name=transcript.name, + edit=TranscriptEdit( + cdna_start=cdna_start, + cdna_end=cdna_end, + alt_bases=alt_bases, + source_variant=source_variant, + ), + ) + + +def make_isovar_result(variant, protein_sequence): + read_evidence = ReadEvidence( + trimmed_base1_start=variant.start, + trimmed_ref=variant.ref, + trimmed_alt=variant.alt, + ref_reads=[], + alt_reads=[ + AlleleRead(prefix="A", allele=variant.alt, suffix="T", name="read-1") + ], + other_reads=[], + ) + return IsovarResult( + variant=variant, + read_evidence=read_evidence, + predicted_effect=None, + sorted_protein_sequences=[protein_sequence], + ) + + +def test_categorize_transcript_assembly_edits_from_translation(): + variant = Variant("1", 105, "A", "G", normalize_contig_names=False) + transcript = DummyTranscript(id="tx-1", name="TX1") + translation = make_translation(variant, transcript) + + categorized_edits = categorize_transcript_assembly_edits_from_translation( + translation, + transcript, + ) + + eq_( + categorized_edits["known_somatic"], + (make_expected_edit(transcript, 5, 6, "G", variant),), + ) + eq_(categorized_edits["known_germline"], ()) + eq_( + categorized_edits["unexplained"], + ( + make_expected_edit(transcript, 2, 3, "G"), + make_expected_edit(transcript, 8, 9, "A"), + ), + ) + + +def test_categorize_transcript_assembly_edits_ignores_terminal_suffix_gap(): + variant = Variant("1", 105, "A", "G", normalize_contig_names=False) + transcript = DummyTranscript(id="tx-1", name="TX1") + translation = make_translation( + variant, + transcript, + reference_suffix="CCGG", + observed_suffix="CC", + ) + + categorized_edits = categorize_transcript_assembly_edits_from_translation( + translation, + transcript, + ) + + eq_( + categorized_edits["known_somatic"], + (make_expected_edit(transcript, 5, 6, "G", variant),), + ) + eq_(categorized_edits["known_germline"], ()) + eq_(categorized_edits["unexplained"], (make_expected_edit(transcript, 2, 3, "G"),)) + + +def test_protein_sequence_exposes_deduplicated_transcript_assembly_edits(): + variant = Variant("1", 105, "A", "G", normalize_contig_names=False) + transcript = DummyTranscript(id="tx-1", name="TX1") + translation = make_translation(variant, transcript) + protein_sequence = ProteinSequence.from_translations([translation, translation]) + + eq_( + protein_sequence.known_somatic_transcript_edits, + (make_expected_edit(transcript, 5, 6, "G", variant),), + ) + eq_(protein_sequence.known_germline_transcript_edits, ()) + eq_( + protein_sequence.unexplained_transcript_edits, + ( + make_expected_edit(transcript, 2, 3, "G"), + make_expected_edit(transcript, 8, 9, "A"), + ), + ) + + +def test_varcode_adapter_mutant_transcript_includes_unexplained_edits(): + variant = Variant("1", 105, "A", "G", normalize_contig_names=False) + transcript = DummyTranscript(id="tx-1", name="TX1") + translation = make_translation(variant, transcript) + protein_sequence = ProteinSequence.from_translations([translation]) + provider = VarcodeAdapter([make_isovar_result(variant, protein_sequence)]) + + mutant_transcript = provider.mutant_transcript(variant, transcript) + + assert mutant_transcript is not None + eq_( + tuple( + (edit.cdna_start, edit.cdna_end, edit.alt_bases, edit.source_variant) + for edit in mutant_transcript.edits + ), + ( + (2, 3, "G", None), + (5, 6, "G", variant), + (8, 9, "A", None), + ), + ) diff --git a/tests/test_varcode_adapter.py b/tests/test_varcode_adapter.py index 4a951d1..5a1ffb6 100644 --- a/tests/test_varcode_adapter.py +++ b/tests/test_varcode_adapter.py @@ -58,8 +58,12 @@ def test_varcode_adapter_exposes_contig_and_mutant_transcript(result_and_transcr assert mutant_transcript.cdna_sequence in result.top_protein_sequence.cdna_sequences eq_(mutant_transcript.mutant_protein_sequence, result.top_protein_sequence.amino_acids) eq_(mutant_transcript.annotator_name, "isovar") - eq_(len(mutant_transcript.edits), 1) - eq_(mutant_transcript.edits[0].source_variant, result.variant) + assert len(mutant_transcript.edits) >= 1 + assert any( + edit.source_variant is not None and + edit.source_variant == result.variant + for edit in mutant_transcript.edits + ) def test_isovar_phase_resolver_applies_contig_mutant_transcript(result_and_transcript):