diff --git a/isovar/__init__.py b/isovar/__init__.py index 1a1bd83..4e594c8 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.5.1" +__version__ = "1.5.2" from .allele_read import AlleleRead @@ -18,6 +18,7 @@ from .isovar_result import IsovarResult from .locus_read import LocusRead from .main import run_isovar +from .phase_group import PhaseGroup from .protein_sequence import ProteinSequence from .protein_sequence_creator import ProteinSequenceCreator from .read_collector import ReadCollector @@ -32,6 +33,7 @@ "run_isovar", "isovar_results_to_dataframe", "AlleleRead", + "PhaseGroup", "IsovarResult", "LocusRead", "ProteinSequence", diff --git a/isovar/isovar_result.py b/isovar/isovar_result.py index 4fcb77e..dc564d2 100644 --- a/isovar/isovar_result.py +++ b/isovar/isovar_result.py @@ -46,7 +46,9 @@ def __init__( sorted_protein_sequences=None, filter_values=None, phased_variants_in_supporting_reads=None, - phased_variants_in_protein_sequence=None): + phased_variants_in_protein_sequence=None, + phase_group_from_supporting_reads=None, + phase_group_from_protein_sequence=None): """ Parameters ---------- @@ -67,12 +69,24 @@ def __init__( passed that filter. phased_variants_in_supporting_reads : set of varcode.Variant - Other somatic variants which occur in the alt reads supporting the - variant associated with this IsovarResult. + Other somatic variants which directly share enough alt-read support + with the variant associated with this IsovarResult. phased_variants_in_protein_sequence : set of varcode.Variant - Other somatic variants which occur in the reads used to construct - the top protein sequence associated with this IsovarResult. + Other somatic variants which directly share enough protein- + sequence-supporting reads with the variant associated with this + IsovarResult. + + phase_group_from_supporting_reads : PhaseGroup or None + Explicit phasing component inferred from all RNA reads supporting + this variant. This group may contain variants which are only + transitively connected through other variants. + + phase_group_from_protein_sequence : PhaseGroup or None + Explicit phasing component inferred from reads used to construct the + top protein sequence for this variant. This group may contain + variants which are only transitively connected through other + variants. """ self.variant = variant self.read_evidence = read_evidence @@ -100,6 +114,9 @@ def __init__( self.phased_variants_in_protein_sequence = \ phased_variants_in_protein_sequence + self.phase_group_from_supporting_reads = phase_group_from_supporting_reads + self.phase_group_from_protein_sequence = phase_group_from_protein_sequence + @property def fields(self): """ @@ -113,6 +130,8 @@ def fields(self): "filter_values", "phased_variants_in_supporting_reads", "phased_variants_in_protein_sequence", + "phase_group_from_supporting_reads", + "phase_group_from_protein_sequence", ] def __str__(self): diff --git a/isovar/phase_group.py b/isovar/phase_group.py new file mode 100644 index 0000000..68537e3 --- /dev/null +++ b/isovar/phase_group.py @@ -0,0 +1,42 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Explicit representation of a phased set of variants observed together in RNA. +""" + +from .value_object import ValueObject + + +class PhaseGroup(ValueObject): + """ + A connected set of somatic variants that co-occur in RNA evidence. + + This is a group-level object rather than a focal-variant annotation. A + variant can have pairwise phasing support with only a subset of the other + variants in the group, while still belonging to the same connected + component of the phasing graph. + + Germline variants are not populated yet, but the field is included so the + public model can grow into that use case without changing shape again. + """ + + __slots__ = [ + "somatic_variants", + "germline_variants", + "supporting_read_names", + ] + + def __init__(self, somatic_variants, germline_variants=(), supporting_read_names=()): + self.somatic_variants = tuple(somatic_variants) + self.germline_variants = tuple(germline_variants) + self.supporting_read_names = frozenset(supporting_read_names) diff --git a/isovar/phasing.py b/isovar/phasing.py index 1bf376f..21f4a54 100644 --- a/isovar/phasing.py +++ b/isovar/phasing.py @@ -13,6 +13,28 @@ from collections import defaultdict, Counter from .default_parameters import MIN_SHARED_FRAGMENTS_FOR_PHASING +from .phase_group import PhaseGroup + + +def _variant_sort_key(variant): + return ( + variant.contig, + variant.start, + variant.ref, + variant.alt, + ) + + +def create_read_names_to_variants_dict(variant_to_read_names_dict): + """ + Invert a variant -> read-name mapping into read-name -> variants. + """ + read_names_to_variants = defaultdict(set) + + for variant, read_names in variant_to_read_names_dict.items(): + for read_name in read_names: + read_names_to_variants[read_name].add(variant) + return read_names_to_variants def create_variant_to_alt_read_names_dict(isovar_results): @@ -72,11 +94,9 @@ def compute_phasing_counts(variant_to_read_names_dict): ------- Dictionary from variant to Counter(Variant) """ - read_names_to_variants = defaultdict(set) - - for variant, read_names in variant_to_read_names_dict.items(): - for read_name in read_names: - read_names_to_variants[read_name].add(variant) + read_names_to_variants = create_read_names_to_variants_dict( + variant_to_read_names_dict + ) # now count up how many reads are shared between pairs of variants phasing_counts = defaultdict(Counter) @@ -110,6 +130,64 @@ def threshold_phased_variant_counts(counts_dict, min_count): } +def create_phase_groups( + variant_to_read_names_dict, + min_shared_fragments_for_phasing): + """ + Group variants into connected components of the phasing graph. + + Returns + ------- + dict + Mapping from variant to PhaseGroup. Variants without phased partners are + omitted. + """ + phasing_counts = compute_phasing_counts(variant_to_read_names_dict) + phased_neighbors = { + variant: threshold_phased_variant_counts( + phasing_counts[variant], + min_count=min_shared_fragments_for_phasing) + for variant in variant_to_read_names_dict + } + + read_names_to_variants = create_read_names_to_variants_dict( + variant_to_read_names_dict + ) + + visited = set() + variant_to_phase_group = {} + for variant in sorted(variant_to_read_names_dict, key=_variant_sort_key): + if variant in visited: + continue + + component = set() + pending = [variant] + while pending: + current_variant = pending.pop() + if current_variant in component: + continue + component.add(current_variant) + visited.add(current_variant) + pending.extend(phased_neighbors.get(current_variant, set()) - component) + + if len(component) <= 1: + continue + + supporting_read_names = { + read_name + for read_name, read_variants in read_names_to_variants.items() + if len(component.intersection(read_variants)) >= 2 + } + phase_group = PhaseGroup( + somatic_variants=tuple(sorted(component, key=_variant_sort_key)), + germline_variants=(), + supporting_read_names=supporting_read_names, + ) + for grouped_variant in component: + variant_to_phase_group[grouped_variant] = phase_group + return variant_to_phase_group + + def annotate_phased_variants( unphased_isovar_results, min_shared_fragments_for_phasing=MIN_SHARED_FRAGMENTS_FOR_PHASING): @@ -129,29 +207,48 @@ def annotate_phased_variants( list of IsovarResult """ - # create dictionary counting how often each variant co-occurs with others - # in any reads supporting those variants phasing_counts_from_supporting_reads = compute_phasing_counts( - create_variant_to_alt_read_names_dict(unphased_isovar_results)) + create_variant_to_alt_read_names_dict(unphased_isovar_results) + ) + + phase_groups_from_supporting_reads = create_phase_groups( + create_variant_to_alt_read_names_dict(unphased_isovar_results), + min_shared_fragments_for_phasing=min_shared_fragments_for_phasing, + ) - # create dictionary counting how often each variant co-occurs with others - # in any reads used to construct their protein sequences phasing_counts_from_protein_sequences = compute_phasing_counts( create_variant_to_protein_sequence_read_names_dict( - unphased_isovar_results)) + unphased_isovar_results) + ) + + phase_groups_from_protein_sequences = create_phase_groups( + create_variant_to_protein_sequence_read_names_dict( + unphased_isovar_results), + min_shared_fragments_for_phasing=min_shared_fragments_for_phasing, + ) results_with_phasing = [] for isovar_result in unphased_isovar_results: variant = isovar_result.variant + phase_group_from_supporting_reads = phase_groups_from_supporting_reads.get( + variant + ) phased_variants_in_supporting_reads = threshold_phased_variant_counts( phasing_counts_from_supporting_reads[variant], - min_count=min_shared_fragments_for_phasing) - phased_variants_in_protein_sequence = \ - threshold_phased_variant_counts( - phasing_counts_from_protein_sequences[variant], - min_count=min_shared_fragments_for_phasing) + min_count=min_shared_fragments_for_phasing, + ) + + phase_group_from_protein_sequence = phase_groups_from_protein_sequences.get( + variant + ) + phased_variants_in_protein_sequence = threshold_phased_variant_counts( + phasing_counts_from_protein_sequences[variant], + min_count=min_shared_fragments_for_phasing, + ) results_with_phasing.append( isovar_result.clone_with_updates( + phase_group_from_supporting_reads=phase_group_from_supporting_reads, + phase_group_from_protein_sequence=phase_group_from_protein_sequence, phased_variants_in_supporting_reads=phased_variants_in_supporting_reads, phased_variants_in_protein_sequence=phased_variants_in_protein_sequence)) return results_with_phasing diff --git a/tests/test_phasing.py b/tests/test_phasing.py new file mode 100644 index 0000000..4d2747f --- /dev/null +++ b/tests/test_phasing.py @@ -0,0 +1,107 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from varcode import Variant + +from isovar.allele_read import AlleleRead +from isovar.isovar_result import IsovarResult +from isovar.phasing import annotate_phased_variants +from isovar.read_evidence import ReadEvidence + +from .common import eq_ + + +class DummyProteinSequence(object): + def __init__(self, read_names): + self.read_names_supporting_protein_sequence = set(read_names) + + +def make_isovar_result(variant, alt_read_names, protein_read_names): + read_evidence = ReadEvidence( + trimmed_base1_start=variant.start, + trimmed_ref=variant.ref, + trimmed_alt=variant.alt, + ref_reads=[], + alt_reads=[ + AlleleRead(prefix="A", allele=variant.alt, suffix="T", name=read_name) + for read_name in alt_read_names + ], + other_reads=[], + ) + return IsovarResult( + variant=variant, + read_evidence=read_evidence, + predicted_effect=None, + sorted_protein_sequences=[DummyProteinSequence(protein_read_names)], + ) + + +def test_annotate_phased_variants_creates_explicit_phase_groups(): + v1 = Variant("1", 10, "A", "C", normalize_contig_names=False) + v2 = Variant("1", 11, "G", "T", normalize_contig_names=False) + v3 = Variant("1", 12, "T", "G", normalize_contig_names=False) + + results = annotate_phased_variants( + [ + make_isovar_result(v1, {"r12"}, {"p12"}), + make_isovar_result(v2, {"r12", "r23"}, {"p12", "p23"}), + make_isovar_result(v3, {"r23"}, {"p23"}), + ], + min_shared_fragments_for_phasing=1, + ) + + results_by_variant = {result.variant: result for result in results} + expected_group_variants = (v1, v2, v3) + expected_phase_read_names = frozenset({"r12", "r23"}) + expected_phase_protein_read_names = frozenset({"p12", "p23"}) + expected_supporting_neighbors = { + v1: {v2}, + v2: {v1, v3}, + v3: {v2}, + } + + for variant, result in results_by_variant.items(): + supporting_group = result.phase_group_from_supporting_reads + protein_group = result.phase_group_from_protein_sequence + + assert supporting_group is not None + assert protein_group is not None + + eq_(supporting_group.somatic_variants, expected_group_variants) + eq_(supporting_group.germline_variants, ()) + eq_(supporting_group.supporting_read_names, expected_phase_read_names) + + eq_(protein_group.somatic_variants, expected_group_variants) + eq_(protein_group.germline_variants, ()) + eq_(protein_group.supporting_read_names, expected_phase_protein_read_names) + + eq_( + result.phased_variants_in_supporting_reads, + expected_supporting_neighbors[variant], + ) + eq_( + result.phased_variants_in_protein_sequence, + expected_supporting_neighbors[variant], + ) + + +def test_annotate_phased_variants_leaves_singletons_without_phase_group(): + variant = Variant("1", 20, "C", "A", normalize_contig_names=False) + result = annotate_phased_variants( + [make_isovar_result(variant, {"solo"}, {"solo-protein"})], + min_shared_fragments_for_phasing=1, + )[0] + + assert result.phase_group_from_supporting_reads is None + assert result.phase_group_from_protein_sequence is None + eq_(result.phased_variants_in_supporting_reads, set()) + eq_(result.phased_variants_in_protein_sequence, set())