diff --git a/isovar/__init__.py b/isovar/__init__.py index 4e594c8..d89ef68 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.5.2" +__version__ = "1.5.3" from .allele_read import AlleleRead diff --git a/isovar/isovar_result.py b/isovar/isovar_result.py index dc564d2..d1385c3 100644 --- a/isovar/isovar_result.py +++ b/isovar/isovar_result.py @@ -80,13 +80,16 @@ def __init__( phase_group_from_supporting_reads : PhaseGroup or None Explicit phasing component inferred from all RNA reads supporting this variant. This group may contain variants which are only - transitively connected through other variants. + transitively connected through other variants. Read-only groups do + not necessarily carry assembled cDNA or transcript metadata. phase_group_from_protein_sequence : PhaseGroup or None Explicit phasing component inferred from reads used to construct the top protein sequence for this variant. This group may contain variants which are only transitively connected through other - variants. + variants. When available, this group also carries directly observed + cDNA, protein, and transcript metadata from the assembled protein + sequences in the group. """ self.variant = variant self.read_evidence = read_evidence diff --git a/isovar/phase_group.py b/isovar/phase_group.py index 68537e3..10317a1 100644 --- a/isovar/phase_group.py +++ b/isovar/phase_group.py @@ -28,15 +28,36 @@ class PhaseGroup(ValueObject): Germline variants are not populated yet, but the field is included so the public model can grow into that use case without changing shape again. + + Some phase groups are backed by translated Isovar assemblies. In those + cases the group can also carry directly observed cDNA, protein, and + transcript metadata from the supporting assemblies. Read-only phasing groups + leave those fields empty. """ __slots__ = [ "somatic_variants", "germline_variants", "supporting_read_names", + "cdna_sequences", + "mutant_protein_sequences", + "transcript_ids", + "transcript_names", ] - def __init__(self, somatic_variants, germline_variants=(), supporting_read_names=()): + def __init__( + self, + somatic_variants, + germline_variants=(), + supporting_read_names=(), + cdna_sequences=(), + mutant_protein_sequences=(), + transcript_ids=(), + transcript_names=()): self.somatic_variants = tuple(somatic_variants) self.germline_variants = tuple(germline_variants) self.supporting_read_names = frozenset(supporting_read_names) + self.cdna_sequences = tuple(cdna_sequences) + self.mutant_protein_sequences = tuple(mutant_protein_sequences) + self.transcript_ids = tuple(transcript_ids) + self.transcript_names = tuple(transcript_names) diff --git a/isovar/phasing.py b/isovar/phasing.py index 21f4a54..0243f6c 100644 --- a/isovar/phasing.py +++ b/isovar/phasing.py @@ -82,6 +82,22 @@ def create_variant_to_protein_sequence_read_names_dict(isovar_results): return variant_to_read_names +def create_variant_to_top_protein_sequence_dict(isovar_results): + """ + Create dictionary from variant to its top translated protein sequence. + + Variants without an assembled protein sequence are mapped to None. + """ + return { + isovar_result.variant: ( + isovar_result.top_protein_sequence + if isovar_result.has_mutant_protein_sequence_from_rna + else None + ) + for isovar_result in isovar_results + } + + def compute_phasing_counts(variant_to_read_names_dict): """ @@ -132,10 +148,15 @@ def threshold_phased_variant_counts(counts_dict, min_count): def create_phase_groups( variant_to_read_names_dict, - min_shared_fragments_for_phasing): + min_shared_fragments_for_phasing, + variant_to_top_protein_sequence_dict=None): """ Group variants into connected components of the phasing graph. + If top translated protein sequences are provided then each resulting + PhaseGroup is also annotated with directly observed cDNA, protein, and + transcript metadata from those assemblies. + Returns ------- dict @@ -178,10 +199,36 @@ def create_phase_groups( for read_name, read_variants in read_names_to_variants.items() if len(component.intersection(read_variants)) >= 2 } + + if variant_to_top_protein_sequence_dict is None: + cdna_sequences = () + mutant_protein_sequences = () + transcript_ids = () + transcript_names = () + else: + cdna_sequences = set() + mutant_protein_sequences = set() + transcript_ids = set() + transcript_names = set() + for grouped_variant in component: + protein_sequence = variant_to_top_protein_sequence_dict.get( + grouped_variant + ) + if protein_sequence is None: + continue + cdna_sequences.update(protein_sequence.cdna_sequences) + mutant_protein_sequences.add(protein_sequence.amino_acids) + transcript_ids.update(protein_sequence.transcript_ids) + transcript_names.update(protein_sequence.transcript_names) + phase_group = PhaseGroup( somatic_variants=tuple(sorted(component, key=_variant_sort_key)), germline_variants=(), supporting_read_names=supporting_read_names, + cdna_sequences=tuple(sorted(cdna_sequences)), + mutant_protein_sequences=tuple(sorted(mutant_protein_sequences)), + transcript_ids=tuple(sorted(transcript_ids)), + transcript_names=tuple(sorted(transcript_names)), ) for grouped_variant in component: variant_to_phase_group[grouped_variant] = phase_group @@ -225,6 +272,8 @@ def annotate_phased_variants( create_variant_to_protein_sequence_read_names_dict( unphased_isovar_results), min_shared_fragments_for_phasing=min_shared_fragments_for_phasing, + variant_to_top_protein_sequence_dict=create_variant_to_top_protein_sequence_dict( + unphased_isovar_results), ) results_with_phasing = [] diff --git a/tests/test_phasing.py b/tests/test_phasing.py index 4d2747f..d18071f 100644 --- a/tests/test_phasing.py +++ b/tests/test_phasing.py @@ -21,11 +21,28 @@ class DummyProteinSequence(object): - def __init__(self, read_names): + def __init__( + self, + read_names, + amino_acids, + cdna_sequences, + transcript_ids, + transcript_names): self.read_names_supporting_protein_sequence = set(read_names) - - -def make_isovar_result(variant, alt_read_names, protein_read_names): + self.amino_acids = amino_acids + self.cdna_sequences = set(cdna_sequences) + self.transcript_ids = list(transcript_ids) + self.transcript_names = list(transcript_names) + + +def make_isovar_result( + variant, + alt_read_names, + protein_read_names, + amino_acids=None, + cdna_sequences=(), + transcript_ids=(), + transcript_names=()): read_evidence = ReadEvidence( trimmed_base1_start=variant.start, trimmed_ref=variant.ref, @@ -41,7 +58,13 @@ def make_isovar_result(variant, alt_read_names, protein_read_names): variant=variant, read_evidence=read_evidence, predicted_effect=None, - sorted_protein_sequences=[DummyProteinSequence(protein_read_names)], + sorted_protein_sequences=[DummyProteinSequence( + read_names=protein_read_names, + amino_acids=amino_acids if amino_acids is not None else variant.alt, + cdna_sequences=cdna_sequences, + transcript_ids=transcript_ids, + transcript_names=transcript_names, + )], ) @@ -52,9 +75,33 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): results = annotate_phased_variants( [ - make_isovar_result(v1, {"r12"}, {"p12"}), - make_isovar_result(v2, {"r12", "r23"}, {"p12", "p23"}), - make_isovar_result(v3, {"r23"}, {"p23"}), + make_isovar_result( + v1, + {"r12"}, + {"p12"}, + amino_acids="AA-1", + cdna_sequences={"cdna-1"}, + transcript_ids={"tx-1", "tx-shared"}, + transcript_names={"TX1", "TX_SHARED"}, + ), + make_isovar_result( + v2, + {"r12", "r23"}, + {"p12", "p23"}, + amino_acids="AA-2", + cdna_sequences={"cdna-2"}, + transcript_ids={"tx-2", "tx-shared"}, + transcript_names={"TX2", "TX_SHARED"}, + ), + make_isovar_result( + v3, + {"r23"}, + {"p23"}, + amino_acids="AA-3", + cdna_sequences={"cdna-3"}, + transcript_ids={"tx-2"}, + transcript_names={"TX2"}, + ), ], min_shared_fragments_for_phasing=1, ) @@ -63,6 +110,10 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): expected_group_variants = (v1, v2, v3) expected_phase_read_names = frozenset({"r12", "r23"}) expected_phase_protein_read_names = frozenset({"p12", "p23"}) + expected_cdna_sequences = ("cdna-1", "cdna-2", "cdna-3") + expected_protein_sequences = ("AA-1", "AA-2", "AA-3") + expected_transcript_ids = ("tx-1", "tx-2", "tx-shared") + expected_transcript_names = ("TX1", "TX2", "TX_SHARED") expected_supporting_neighbors = { v1: {v2}, v2: {v1, v3}, @@ -79,10 +130,18 @@ def test_annotate_phased_variants_creates_explicit_phase_groups(): eq_(supporting_group.somatic_variants, expected_group_variants) eq_(supporting_group.germline_variants, ()) eq_(supporting_group.supporting_read_names, expected_phase_read_names) + eq_(supporting_group.cdna_sequences, ()) + eq_(supporting_group.mutant_protein_sequences, ()) + eq_(supporting_group.transcript_ids, ()) + eq_(supporting_group.transcript_names, ()) eq_(protein_group.somatic_variants, expected_group_variants) eq_(protein_group.germline_variants, ()) eq_(protein_group.supporting_read_names, expected_phase_protein_read_names) + eq_(protein_group.cdna_sequences, expected_cdna_sequences) + eq_(protein_group.mutant_protein_sequences, expected_protein_sequences) + eq_(protein_group.transcript_ids, expected_transcript_ids) + eq_(protein_group.transcript_names, expected_transcript_names) eq_( result.phased_variants_in_supporting_reads,