Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.5.2"
__version__ = "1.5.3"


from .allele_read import AlleleRead
Expand Down
7 changes: 5 additions & 2 deletions isovar/isovar_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,16 @@ def __init__(
phase_group_from_supporting_reads : PhaseGroup or None
Explicit phasing component inferred from all RNA reads supporting
this variant. This group may contain variants which are only
transitively connected through other variants.
transitively connected through other variants. Read-only groups do
not necessarily carry assembled cDNA or transcript metadata.

phase_group_from_protein_sequence : PhaseGroup or None
Explicit phasing component inferred from reads used to construct the
top protein sequence for this variant. This group may contain
variants which are only transitively connected through other
variants.
variants. When available, this group also carries directly observed
cDNA, protein, and transcript metadata from the assembled protein
sequences in the group.
"""
self.variant = variant
self.read_evidence = read_evidence
Expand Down
23 changes: 22 additions & 1 deletion isovar/phase_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,36 @@ class PhaseGroup(ValueObject):

Germline variants are not populated yet, but the field is included so the
public model can grow into that use case without changing shape again.

Some phase groups are backed by translated Isovar assemblies. In those
cases the group can also carry directly observed cDNA, protein, and
transcript metadata from the supporting assemblies. Read-only phasing groups
leave those fields empty.
"""

__slots__ = [
"somatic_variants",
"germline_variants",
"supporting_read_names",
"cdna_sequences",
"mutant_protein_sequences",
"transcript_ids",
"transcript_names",
]

def __init__(self, somatic_variants, germline_variants=(), supporting_read_names=()):
def __init__(
self,
somatic_variants,
germline_variants=(),
supporting_read_names=(),
cdna_sequences=(),
mutant_protein_sequences=(),
transcript_ids=(),
transcript_names=()):
self.somatic_variants = tuple(somatic_variants)
self.germline_variants = tuple(germline_variants)
self.supporting_read_names = frozenset(supporting_read_names)
self.cdna_sequences = tuple(cdna_sequences)
self.mutant_protein_sequences = tuple(mutant_protein_sequences)
self.transcript_ids = tuple(transcript_ids)
self.transcript_names = tuple(transcript_names)
51 changes: 50 additions & 1 deletion isovar/phasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ def create_variant_to_protein_sequence_read_names_dict(isovar_results):
return variant_to_read_names


def create_variant_to_top_protein_sequence_dict(isovar_results):
"""
Create dictionary from variant to its top translated protein sequence.

Variants without an assembled protein sequence are mapped to None.
"""
return {
isovar_result.variant: (
isovar_result.top_protein_sequence
if isovar_result.has_mutant_protein_sequence_from_rna
else None
)
for isovar_result in isovar_results
}


def compute_phasing_counts(variant_to_read_names_dict):
"""

Expand Down Expand Up @@ -132,10 +148,15 @@ def threshold_phased_variant_counts(counts_dict, min_count):

def create_phase_groups(
variant_to_read_names_dict,
min_shared_fragments_for_phasing):
min_shared_fragments_for_phasing,
variant_to_top_protein_sequence_dict=None):
"""
Group variants into connected components of the phasing graph.

If top translated protein sequences are provided then each resulting
PhaseGroup is also annotated with directly observed cDNA, protein, and
transcript metadata from those assemblies.

Returns
-------
dict
Expand Down Expand Up @@ -178,10 +199,36 @@ def create_phase_groups(
for read_name, read_variants in read_names_to_variants.items()
if len(component.intersection(read_variants)) >= 2
}

if variant_to_top_protein_sequence_dict is None:
cdna_sequences = ()
mutant_protein_sequences = ()
transcript_ids = ()
transcript_names = ()
else:
cdna_sequences = set()
mutant_protein_sequences = set()
transcript_ids = set()
transcript_names = set()
for grouped_variant in component:
protein_sequence = variant_to_top_protein_sequence_dict.get(
grouped_variant
)
if protein_sequence is None:
continue
cdna_sequences.update(protein_sequence.cdna_sequences)
mutant_protein_sequences.add(protein_sequence.amino_acids)
transcript_ids.update(protein_sequence.transcript_ids)
transcript_names.update(protein_sequence.transcript_names)

phase_group = PhaseGroup(
somatic_variants=tuple(sorted(component, key=_variant_sort_key)),
germline_variants=(),
supporting_read_names=supporting_read_names,
cdna_sequences=tuple(sorted(cdna_sequences)),
mutant_protein_sequences=tuple(sorted(mutant_protein_sequences)),
transcript_ids=tuple(sorted(transcript_ids)),
transcript_names=tuple(sorted(transcript_names)),
)
for grouped_variant in component:
variant_to_phase_group[grouped_variant] = phase_group
Expand Down Expand Up @@ -225,6 +272,8 @@ def annotate_phased_variants(
create_variant_to_protein_sequence_read_names_dict(
unphased_isovar_results),
min_shared_fragments_for_phasing=min_shared_fragments_for_phasing,
variant_to_top_protein_sequence_dict=create_variant_to_top_protein_sequence_dict(
unphased_isovar_results),
)

results_with_phasing = []
Expand Down
75 changes: 67 additions & 8 deletions tests/test_phasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,28 @@


class DummyProteinSequence(object):
def __init__(self, read_names):
def __init__(
self,
read_names,
amino_acids,
cdna_sequences,
transcript_ids,
transcript_names):
self.read_names_supporting_protein_sequence = set(read_names)


def make_isovar_result(variant, alt_read_names, protein_read_names):
self.amino_acids = amino_acids
self.cdna_sequences = set(cdna_sequences)
self.transcript_ids = list(transcript_ids)
self.transcript_names = list(transcript_names)


def make_isovar_result(
variant,
alt_read_names,
protein_read_names,
amino_acids=None,
cdna_sequences=(),
transcript_ids=(),
transcript_names=()):
read_evidence = ReadEvidence(
trimmed_base1_start=variant.start,
trimmed_ref=variant.ref,
Expand All @@ -41,7 +58,13 @@ def make_isovar_result(variant, alt_read_names, protein_read_names):
variant=variant,
read_evidence=read_evidence,
predicted_effect=None,
sorted_protein_sequences=[DummyProteinSequence(protein_read_names)],
sorted_protein_sequences=[DummyProteinSequence(
read_names=protein_read_names,
amino_acids=amino_acids if amino_acids is not None else variant.alt,
cdna_sequences=cdna_sequences,
transcript_ids=transcript_ids,
transcript_names=transcript_names,
)],
)


Expand All @@ -52,9 +75,33 @@ def test_annotate_phased_variants_creates_explicit_phase_groups():

results = annotate_phased_variants(
[
make_isovar_result(v1, {"r12"}, {"p12"}),
make_isovar_result(v2, {"r12", "r23"}, {"p12", "p23"}),
make_isovar_result(v3, {"r23"}, {"p23"}),
make_isovar_result(
v1,
{"r12"},
{"p12"},
amino_acids="AA-1",
cdna_sequences={"cdna-1"},
transcript_ids={"tx-1", "tx-shared"},
transcript_names={"TX1", "TX_SHARED"},
),
make_isovar_result(
v2,
{"r12", "r23"},
{"p12", "p23"},
amino_acids="AA-2",
cdna_sequences={"cdna-2"},
transcript_ids={"tx-2", "tx-shared"},
transcript_names={"TX2", "TX_SHARED"},
),
make_isovar_result(
v3,
{"r23"},
{"p23"},
amino_acids="AA-3",
cdna_sequences={"cdna-3"},
transcript_ids={"tx-2"},
transcript_names={"TX2"},
),
],
min_shared_fragments_for_phasing=1,
)
Expand All @@ -63,6 +110,10 @@ def test_annotate_phased_variants_creates_explicit_phase_groups():
expected_group_variants = (v1, v2, v3)
expected_phase_read_names = frozenset({"r12", "r23"})
expected_phase_protein_read_names = frozenset({"p12", "p23"})
expected_cdna_sequences = ("cdna-1", "cdna-2", "cdna-3")
expected_protein_sequences = ("AA-1", "AA-2", "AA-3")
expected_transcript_ids = ("tx-1", "tx-2", "tx-shared")
expected_transcript_names = ("TX1", "TX2", "TX_SHARED")
expected_supporting_neighbors = {
v1: {v2},
v2: {v1, v3},
Expand All @@ -79,10 +130,18 @@ def test_annotate_phased_variants_creates_explicit_phase_groups():
eq_(supporting_group.somatic_variants, expected_group_variants)
eq_(supporting_group.germline_variants, ())
eq_(supporting_group.supporting_read_names, expected_phase_read_names)
eq_(supporting_group.cdna_sequences, ())
eq_(supporting_group.mutant_protein_sequences, ())
eq_(supporting_group.transcript_ids, ())
eq_(supporting_group.transcript_names, ())

eq_(protein_group.somatic_variants, expected_group_variants)
eq_(protein_group.germline_variants, ())
eq_(protein_group.supporting_read_names, expected_phase_protein_read_names)
eq_(protein_group.cdna_sequences, expected_cdna_sequences)
eq_(protein_group.mutant_protein_sequences, expected_protein_sequences)
eq_(protein_group.transcript_ids, expected_transcript_ids)
eq_(protein_group.transcript_names, expected_transcript_names)

eq_(
result.phased_variants_in_supporting_reads,
Expand Down
Loading