Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.5.3"
__version__ = "1.5.4"


from .allele_read import AlleleRead
Expand All @@ -23,6 +23,7 @@
from .protein_sequence_creator import ProteinSequenceCreator
from .read_collector import ReadCollector
from .read_evidence import ReadEvidence
from .transcript_assembly_edit import TranscriptAssemblyEdit
from .variant_orf import VariantORF
from .variant_sequence import VariantSequence
from .variant_sequence_creator import VariantSequenceCreator
Expand All @@ -40,6 +41,7 @@
"ProteinSequenceCreator",
"ReadCollector",
"ReadEvidence",
"TranscriptAssemblyEdit",
"VariantORF",
"VariantSequence",
"VariantSequenceCreator",
Expand Down
19 changes: 15 additions & 4 deletions isovar/phase_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ class PhaseGroup(ValueObject):
public model can grow into that use case without changing shape again.

Some phase groups are backed by translated Isovar assemblies. In those
cases the group can also carry directly observed cDNA, protein, and
transcript metadata from the supporting assemblies. Read-only phasing groups
leave those fields empty.
cases the group can also carry directly observed cDNA, protein, transcript,
and transcript-edit metadata from the supporting assemblies. Read-only
phasing groups leave those fields empty.
"""

__slots__ = [
Expand All @@ -43,6 +43,9 @@ class PhaseGroup(ValueObject):
"mutant_protein_sequences",
"transcript_ids",
"transcript_names",
"known_somatic_transcript_edits",
"known_germline_transcript_edits",
"unexplained_transcript_edits",
]

def __init__(
Expand All @@ -53,11 +56,19 @@ def __init__(
cdna_sequences=(),
mutant_protein_sequences=(),
transcript_ids=(),
transcript_names=()):
transcript_names=(),
known_somatic_transcript_edits=(),
known_germline_transcript_edits=(),
unexplained_transcript_edits=()):
self.somatic_variants = tuple(somatic_variants)
self.germline_variants = tuple(germline_variants)
self.supporting_read_names = frozenset(supporting_read_names)
self.cdna_sequences = tuple(cdna_sequences)
self.mutant_protein_sequences = tuple(mutant_protein_sequences)
self.transcript_ids = tuple(transcript_ids)
self.transcript_names = tuple(transcript_names)
self.known_somatic_transcript_edits = tuple(
known_somatic_transcript_edits)
self.known_germline_transcript_edits = tuple(
known_germline_transcript_edits)
self.unexplained_transcript_edits = tuple(unexplained_transcript_edits)
46 changes: 46 additions & 0 deletions isovar/phasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from .default_parameters import MIN_SHARED_FRAGMENTS_FOR_PHASING
from .phase_group import PhaseGroup
from .transcript_edit_helpers import transcript_assembly_edit_sort_key


def _variant_sort_key(variant):
Expand Down Expand Up @@ -205,21 +206,54 @@ def create_phase_groups(
mutant_protein_sequences = ()
transcript_ids = ()
transcript_names = ()
known_somatic_transcript_edits = ()
known_germline_transcript_edits = ()
unexplained_transcript_edits = ()
else:
cdna_sequences = set()
mutant_protein_sequences = set()
transcript_ids = set()
transcript_names = set()
known_somatic_transcript_edits = set()
known_germline_transcript_edits = set()
unexplained_transcript_edits = set()
for grouped_variant in component:
protein_sequence = variant_to_top_protein_sequence_dict.get(
grouped_variant
)
if protein_sequence is None:
continue
if hasattr(protein_sequence, "_transcript_assembly_edits_by_category"):
categorized_edits = (
protein_sequence._transcript_assembly_edits_by_category())
else:
categorized_edits = {
"known_somatic": getattr(
protein_sequence,
"known_somatic_transcript_edits",
(),
),
"known_germline": getattr(
protein_sequence,
"known_germline_transcript_edits",
(),
),
"unexplained": getattr(
protein_sequence,
"unexplained_transcript_edits",
(),
),
}
cdna_sequences.update(protein_sequence.cdna_sequences)
mutant_protein_sequences.add(protein_sequence.amino_acids)
transcript_ids.update(protein_sequence.transcript_ids)
transcript_names.update(protein_sequence.transcript_names)
known_somatic_transcript_edits.update(
categorized_edits["known_somatic"])
known_germline_transcript_edits.update(
categorized_edits["known_germline"])
unexplained_transcript_edits.update(
categorized_edits["unexplained"])

phase_group = PhaseGroup(
somatic_variants=tuple(sorted(component, key=_variant_sort_key)),
Expand All @@ -229,6 +263,18 @@ def create_phase_groups(
mutant_protein_sequences=tuple(sorted(mutant_protein_sequences)),
transcript_ids=tuple(sorted(transcript_ids)),
transcript_names=tuple(sorted(transcript_names)),
known_somatic_transcript_edits=tuple(sorted(
known_somatic_transcript_edits,
key=transcript_assembly_edit_sort_key,
)),
known_germline_transcript_edits=tuple(sorted(
known_germline_transcript_edits,
key=transcript_assembly_edit_sort_key,
)),
unexplained_transcript_edits=tuple(sorted(
unexplained_transcript_edits,
key=transcript_assembly_edit_sort_key,
)),
)
for grouped_variant in component:
variant_to_phase_group[grouped_variant] = phase_group
Expand Down
52 changes: 52 additions & 0 deletions isovar/protein_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
"""

from .common import normalize_base0_range_indices
from .transcript_edit_helpers import (
categorize_transcript_assembly_edits_from_translation,
transcript_assembly_edit_sort_key,
)
from .translation_key import TranslationKey
from .translation import Translation # noqa: F401
from .logging import get_logger
Expand Down Expand Up @@ -258,6 +262,54 @@ def transcript_ids(self):
for transcript in self.transcripts
]

def _transcript_assembly_edits_by_category(self):
    """
    Gather the transcript assembly edits of every translation backing this
    object and group them by category.

    Each translation in self.translations is paired with every transcript
    of its reference context, and the per-pair result of
    categorize_transcript_assembly_edits_from_translation is merged into
    one set per category, so an edit reported more than once is kept once.

    Returns a dict with the keys "known_somatic", "known_germline" and
    "unexplained"; each value is a tuple ordered by
    transcript_assembly_edit_sort_key.
    """
    merged = {
        name: set()
        for name in ("known_somatic", "known_germline", "unexplained")
    }
    for translation in self.translations:
        transcripts = translation.reference_context.transcripts
        for transcript in transcripts:
            found_by_name = (
                categorize_transcript_assembly_edits_from_translation(
                    translation,
                    transcript,
                )
            )
            for name, found in found_by_name.items():
                merged[name].update(found)

    # Freeze each bucket into a deterministically ordered tuple.
    ordered = {}
    for name, found in merged.items():
        ordered[name] = tuple(
            sorted(found, key=transcript_assembly_edit_sort_key))
    return ordered

@property
def known_somatic_transcript_edits(self):
    """
    Transcript-relative edits, seen in the assemblies supporting this
    protein sequence, which come from known somatic variants.

    The categorization is recomputed on every access.
    """
    edits_by_category = self._transcript_assembly_edits_by_category()
    return edits_by_category["known_somatic"]

@property
def known_germline_transcript_edits(self):
    """
    Transcript-relative edits, seen in the assemblies supporting this
    protein sequence, which come from known germline variants.

    The categorization is recomputed on every access.
    """
    edits_by_category = self._transcript_assembly_edits_by_category()
    return edits_by_category["known_germline"]

@property
def unexplained_transcript_edits(self):
    """
    Transcript-relative edits, seen in the assemblies supporting this
    protein sequence, which have not yet been matched to a known input
    variant.

    The categorization is recomputed on every access.
    """
    edits_by_category = self._transcript_assembly_edits_by_category()
    return edits_by_category["unexplained"]

@property
def genes(self):
"""
Expand Down
49 changes: 49 additions & 0 deletions isovar/transcript_assembly_edit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Transcript-anchored edit derived from a local Isovar assembly.
"""

from .value_object import ValueObject


class TranscriptAssemblyEdit(ValueObject):
    """
    A TranscriptEdit together with the transcript it is expressed against.

    The coordinates of a TranscriptEdit are only meaningful for one
    particular transcript sequence, so PhaseGroup keeps this
    transcript-anchored wrapper instead of bare edits.

    The read-only properties below forward to the wrapped edit so callers
    can read its fields without going through ``.edit``.
    """

    # transcript_id / transcript_name identify the transcript; edit is the
    # wrapped TranscriptEdit.
    __slots__ = ["transcript_id", "transcript_name", "edit"]

    @property
    def cdna_start(self):
        """The wrapped edit's cdna_start."""
        wrapped = self.edit
        return wrapped.cdna_start

    @property
    def cdna_end(self):
        """The wrapped edit's cdna_end."""
        wrapped = self.edit
        return wrapped.cdna_end

    @property
    def alt_bases(self):
        """The wrapped edit's alt_bases."""
        wrapped = self.edit
        return wrapped.alt_bases

    @property
    def source_variant(self):
        """The wrapped edit's source_variant."""
        wrapped = self.edit
        return wrapped.source_variant
Loading
Loading