Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.5.1"
__version__ = "1.5.2"


from .allele_read import AlleleRead
from .dataframe_helpers import isovar_results_to_dataframe
from .isovar_result import IsovarResult
from .locus_read import LocusRead
from .main import run_isovar
from .phase_group import PhaseGroup
from .protein_sequence import ProteinSequence
from .protein_sequence_creator import ProteinSequenceCreator
from .read_collector import ReadCollector
Expand All @@ -32,6 +33,7 @@
"run_isovar",
"isovar_results_to_dataframe",
"AlleleRead",
"PhaseGroup",
"IsovarResult",
"LocusRead",
"ProteinSequence",
Expand Down
29 changes: 24 additions & 5 deletions isovar/isovar_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def __init__(
sorted_protein_sequences=None,
filter_values=None,
phased_variants_in_supporting_reads=None,
phased_variants_in_protein_sequence=None):
phased_variants_in_protein_sequence=None,
phase_group_from_supporting_reads=None,
phase_group_from_protein_sequence=None):
"""
Parameters
----------
Expand All @@ -67,12 +69,24 @@ def __init__(
passed that filter.

phased_variants_in_supporting_reads : set of varcode.Variant
Other somatic variants which occur in the alt reads supporting the
variant associated with this IsovarResult.
Other somatic variants which directly share enough alt-read support
with the variant associated with this IsovarResult.

phased_variants_in_protein_sequence : set of varcode.Variant
Other somatic variants which occur in the reads used to construct
the top protein sequence associated with this IsovarResult.
Other somatic variants which directly share enough protein-
sequence-supporting reads with the variant associated with this
IsovarResult.

phase_group_from_supporting_reads : PhaseGroup or None
Explicit phasing component inferred from all RNA reads supporting
this variant. This group may contain variants which are only
transitively connected through other variants.

phase_group_from_protein_sequence : PhaseGroup or None
Explicit phasing component inferred from reads used to construct the
top protein sequence for this variant. This group may contain
variants which are only transitively connected through other
variants.
"""
self.variant = variant
self.read_evidence = read_evidence
Expand Down Expand Up @@ -100,6 +114,9 @@ def __init__(
self.phased_variants_in_protein_sequence = \
phased_variants_in_protein_sequence

self.phase_group_from_supporting_reads = phase_group_from_supporting_reads
self.phase_group_from_protein_sequence = phase_group_from_protein_sequence

@property
def fields(self):
"""
Expand All @@ -113,6 +130,8 @@ def fields(self):
"filter_values",
"phased_variants_in_supporting_reads",
"phased_variants_in_protein_sequence",
"phase_group_from_supporting_reads",
"phase_group_from_protein_sequence",
]

def __str__(self):
Expand Down
42 changes: 42 additions & 0 deletions isovar/phase_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Explicit representation of a phased set of variants observed together in RNA.
"""

from .value_object import ValueObject


class PhaseGroup(ValueObject):
    """
    Connected component of the phasing graph: somatic variants which
    co-occur in RNA evidence.

    A PhaseGroup describes the group as a whole and is not an annotation
    relative to one focal variant. Two members of the same group are not
    required to have direct pairwise phasing support; it is enough that they
    are linked through a chain of other variants in the component.

    The `germline_variants` field is not populated yet. It is part of the
    object so that the public model can later cover germline phasing without
    its shape having to change.
    """

    __slots__ = [
        "somatic_variants",
        "germline_variants",
        "supporting_read_names",
    ]

    def __init__(
            self,
            somatic_variants,
            germline_variants=(),
            supporting_read_names=()):
        """
        Parameters
        ----------
        somatic_variants : iterable of variants
            Somatic variants belonging to this group; stored as a tuple in
            the order given.

        germline_variants : iterable of variants
            Germline variants belonging to this group; stored as a tuple.
            Defaults to empty.

        supporting_read_names : iterable of read names
            Names of the reads supporting this group; stored as a frozenset,
            so duplicates are collapsed and order is not kept.
        """
        # Every field is copied into an immutable container so the group
        # does not alias the caller's (possibly mutable) collections.
        self.somatic_variants = tuple(somatic_variants)
        self.germline_variants = tuple(germline_variants)
        self.supporting_read_names = frozenset(supporting_read_names)
129 changes: 113 additions & 16 deletions isovar/phasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,28 @@
from collections import defaultdict, Counter

from .default_parameters import MIN_SHARED_FRAGMENTS_FOR_PHASING
from .phase_group import PhaseGroup


def _variant_sort_key(variant):
    """
    Build the key used to put variants into a deterministic order.

    Returns
    -------
    tuple
        (contig, start, ref, alt) read from the given variant.
    """
    return tuple(
        getattr(variant, field_name)
        for field_name in ("contig", "start", "ref", "alt")
    )


def create_read_names_to_variants_dict(variant_to_read_names_dict):
    """
    Turn a variant -> read names mapping into a read name -> variants mapping.

    Parameters
    ----------
    variant_to_read_names_dict : dict
        Maps each variant to an iterable of the read names associated with it.

    Returns
    -------
    defaultdict(set)
        Maps each read name to the set of variants whose read names include it.
    """
    variants_by_read_name = defaultdict(set)
    for variant in variant_to_read_names_dict:
        for name in variant_to_read_names_dict[variant]:
            variants_by_read_name[name].add(variant)
    return variants_by_read_name


def create_variant_to_alt_read_names_dict(isovar_results):
Expand Down Expand Up @@ -72,11 +94,9 @@ def compute_phasing_counts(variant_to_read_names_dict):
-------
Dictionary from variant to Counter(Variant)
"""
read_names_to_variants = defaultdict(set)

for variant, read_names in variant_to_read_names_dict.items():
for read_name in read_names:
read_names_to_variants[read_name].add(variant)
read_names_to_variants = create_read_names_to_variants_dict(
variant_to_read_names_dict
)

# now count up how many reads are shared between pairs of variants
phasing_counts = defaultdict(Counter)
Expand Down Expand Up @@ -110,6 +130,64 @@ def threshold_phased_variant_counts(counts_dict, min_count):
}


def create_phase_groups(
        variant_to_read_names_dict,
        min_shared_fragments_for_phasing):
    """
    Partition variants into connected components of the phasing graph and
    wrap each component of two or more variants in a PhaseGroup.

    Parameters
    ----------
    variant_to_read_names_dict : dict
        Maps each variant to the names of the reads associated with it.

    min_shared_fragments_for_phasing : int
        Passed as `min_count` to threshold_phased_variant_counts when
        deciding which other variants count as phased neighbors of a variant.

    Returns
    -------
    dict
        Mapping from variant to PhaseGroup. All members of a component map to
        the same PhaseGroup object. Variants without phased partners are
        omitted.
    """
    pairwise_counts = compute_phasing_counts(variant_to_read_names_dict)

    # Edges of the phasing graph: the thresholded neighbors of each variant.
    neighbors_by_variant = {}
    for variant in variant_to_read_names_dict:
        neighbors_by_variant[variant] = threshold_phased_variant_counts(
            pairwise_counts[variant],
            min_count=min_shared_fragments_for_phasing)

    variants_by_read_name = create_read_names_to_variants_dict(
        variant_to_read_names_dict)

    already_grouped = set()
    groups_by_variant = {}

    # Seeds are visited in sorted order so the traversal is deterministic.
    seeds = sorted(variant_to_read_names_dict, key=_variant_sort_key)
    for seed in seeds:
        if seed in already_grouped:
            continue

        # Depth-first walk collecting everything reachable from the seed.
        members = set()
        stack = [seed]
        while stack:
            candidate = stack.pop()
            if candidate not in members:
                members.add(candidate)
                already_grouped.add(candidate)
                unexplored = (
                    neighbors_by_variant.get(candidate, set()) - members)
                stack.extend(unexplored)

        # A variant with no phased partner does not form a group.
        if len(members) < 2:
            continue

        # A read supports the group when it carries at least two of the
        # group's variants.
        shared_read_names = set()
        for read_name, read_variants in variants_by_read_name.items():
            if len(members.intersection(read_variants)) >= 2:
                shared_read_names.add(read_name)

        ordered_members = tuple(sorted(members, key=_variant_sort_key))
        group = PhaseGroup(
            somatic_variants=ordered_members,
            germline_variants=(),
            supporting_read_names=shared_read_names,
        )
        for member in members:
            groups_by_variant[member] = group
    return groups_by_variant


def annotate_phased_variants(
unphased_isovar_results,
min_shared_fragments_for_phasing=MIN_SHARED_FRAGMENTS_FOR_PHASING):
Expand All @@ -129,29 +207,48 @@ def annotate_phased_variants(
list of IsovarResult
"""

# create dictionary counting how often each variant co-occurs with others
# in any reads supporting those variants
phasing_counts_from_supporting_reads = compute_phasing_counts(
create_variant_to_alt_read_names_dict(unphased_isovar_results))
create_variant_to_alt_read_names_dict(unphased_isovar_results)
)

phase_groups_from_supporting_reads = create_phase_groups(
create_variant_to_alt_read_names_dict(unphased_isovar_results),
min_shared_fragments_for_phasing=min_shared_fragments_for_phasing,
)

# create dictionary counting how often each variant co-occurs with others
# in any reads used to construct their protein sequences
phasing_counts_from_protein_sequences = compute_phasing_counts(
create_variant_to_protein_sequence_read_names_dict(
unphased_isovar_results))
unphased_isovar_results)
)

phase_groups_from_protein_sequences = create_phase_groups(
create_variant_to_protein_sequence_read_names_dict(
unphased_isovar_results),
min_shared_fragments_for_phasing=min_shared_fragments_for_phasing,
)

results_with_phasing = []
for isovar_result in unphased_isovar_results:
variant = isovar_result.variant
phase_group_from_supporting_reads = phase_groups_from_supporting_reads.get(
variant
)
phased_variants_in_supporting_reads = threshold_phased_variant_counts(
phasing_counts_from_supporting_reads[variant],
min_count=min_shared_fragments_for_phasing)
phased_variants_in_protein_sequence = \
threshold_phased_variant_counts(
phasing_counts_from_protein_sequences[variant],
min_count=min_shared_fragments_for_phasing)
min_count=min_shared_fragments_for_phasing,
)

phase_group_from_protein_sequence = phase_groups_from_protein_sequences.get(
variant
)
phased_variants_in_protein_sequence = threshold_phased_variant_counts(
phasing_counts_from_protein_sequences[variant],
min_count=min_shared_fragments_for_phasing,
)
results_with_phasing.append(
isovar_result.clone_with_updates(
phase_group_from_supporting_reads=phase_group_from_supporting_reads,
phase_group_from_protein_sequence=phase_group_from_protein_sequence,
phased_variants_in_supporting_reads=phased_variants_in_supporting_reads,
phased_variants_in_protein_sequence=phased_variants_in_protein_sequence))
return results_with_phasing
Loading
Loading