Skip to content

Commit

Permalink
Merge pull request #255 from monarch-initiative/counts_table
Browse files Browse the repository at this point in the history
Counts table
  • Loading branch information
ielis authored Sep 10, 2024
2 parents 02c9663 + cd76b11 commit a966998
Show file tree
Hide file tree
Showing 8 changed files with 1,012 additions and 7 deletions.
583 changes: 583 additions & 0 deletions docs/report/tbx5_all_variants.html

Large diffs are not rendered by default.

33 changes: 32 additions & 1 deletion docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,13 @@ We loaded the patient data into a `cohort` which is ready for the next steps.
Explore cohort
^^^^^^^^^^^^^^

We can now explore the cohort to see how many patients are included.
GPSEA helps with gaining insight into the cohort by providing


Show cohort summary
-------------------

The summary report provides an overview about the HPO terms, variants, diseases, and variant effects that occurr most frequently:

>>> from gpsea.view import CohortViewable
>>> viewer = CohortViewable(hpo)
Expand All @@ -121,6 +127,10 @@ We can now explore the cohort to see how many patients are included.
from IPython.display import HTML, display
display(HTML(report))


Plot distribution of variants with respect to the protein sequence
------------------------------------------------------------------

Now we can show the distribution of variants with respect to the encoded protein.
We first obtain `tx_coordinates` (:class:`~gpsea.model.TranscriptCoordinates`)
and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`)
Expand Down Expand Up @@ -154,6 +164,27 @@ and we follow with plotting the diagram of the mutations on the protein:
:width: 600px


.. _show-cohort-variants:

Summarize all variant alleles
-----------------------------

We can prepare a table of all variant alleles that occurr in the cohort.
Each table row corresponds to a single allele and lists the variant key,
the predicted effect on the transcript (*cDNA*) and protein of interest,
the variant effects, and the number of patients who present
with one or more variant alleles (*Count*):

>>> from gpsea.view import CohortVariantViewer
>>> viewer = CohortVariantViewer(tx_id=tx_id)
>>> report = viewer.process(cohort=cohort)
>>> with open('docs/report/tbx5_all_variants.html', 'w') as fh: # doctest: +SKIP
... _ = fh.write(report)

.. raw:: html
:file: report/tbx5_all_variants.html


Prepare genotype and phenotype predicates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
97 changes: 94 additions & 3 deletions src/gpsea/model/_variant_effects.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from enum import Enum
import enum
import typing

import hpotk

class VariantEffect(Enum):

class VariantEffect(enum.Enum):
"""
`VariantEffect` represents consequences of a variant on transcript that are supported by GPSEA.
Expand Down Expand Up @@ -47,7 +50,7 @@ class VariantEffect(Enum):
THREE_PRIME_UTR_VARIANT = "SO:0001624"
NON_CODING_TRANSCRIPT_EXON_VARIANT = "SO:0001792"
INTRON_VARIANT = "SO:0001627"
NMD_TRANSCRIPT_VARIANT = "SO:0001621",
NMD_TRANSCRIPT_VARIANT = "SO:0001621"
NON_CODING_TRANSCRIPT_VARIANT = "SO:0001619"
UPSTREAM_GENE_VARIANT = "SO:0001631"
DOWNSTREAM_GENE_VARIANT = "SO:0001632"
Expand All @@ -62,6 +65,43 @@ class VariantEffect(Enum):
INTERGENIC_VARIANT = "SO:0001628"
SEQUENCE_VARIANT = "SO:0001060"

def to_display(self) -> str:
"""
Get a concise name of the variant effect that is suitable for showing to humans.
Example
^^^^^^^
>>> from gpsea.model import VariantEffect
>>> VariantEffect.MISSENSE_VARIANT.to_display()
'missense'
>>> VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT.to_display()
'splice donor 5th base'
:returns: a `str` with the name or `'n/a'` if the variant effect was not assigned a concise name.
"""
return effect_to_display.get(self, "n/a")

@staticmethod
def structural_so_id_to_display(so_term: typing.Union[hpotk.TermId, str]) -> str:
"""
Get a `str` with a concise name for representing a Sequence Ontology (SO) term identifier.
Example
^^^^^^^
>>> from gpsea.model import VariantEffect
>>> VariantEffect.structural_so_id_to_display('SO:1000029')
'chromosomal deletion'
:param so_term: a CURIE `str` or a :class:`~hpotk.TermId` with the query SO term.
:returns: a `str` with the concise name for the SO term or `'n/a'` if a name has not been assigned yet.
"""
if isinstance(so_term, hpotk.TermId):
so_term = so_term.value

return so_id_to_display.get(so_term, "n/a")

def __init__(self, curie: str):
self._curie = curie

Expand All @@ -75,3 +115,54 @@ def curie(self) -> str:

def __str__(self) -> str:
return self.name.lower()


effect_to_display = {
VariantEffect.TRANSCRIPT_ABLATION: "transcript ablation",
VariantEffect.SPLICE_ACCEPTOR_VARIANT: "splice acceptor",
VariantEffect.SPLICE_DONOR_VARIANT: "splice donor",
VariantEffect.STOP_GAINED: "stop gained",
VariantEffect.FRAMESHIFT_VARIANT: "frameshift",
VariantEffect.STOP_LOST: "stop lost",
VariantEffect.START_LOST: "start lost",
VariantEffect.TRANSCRIPT_AMPLIFICATION: "transcript amplification",
VariantEffect.INFRAME_INSERTION: "inframe insertion",
VariantEffect.INFRAME_DELETION: "inframe deletion",
VariantEffect.MISSENSE_VARIANT: "missense",
VariantEffect.PROTEIN_ALTERING_VARIANT: "protein altering",
VariantEffect.SPLICE_REGION_VARIANT: "splice region",
VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT: "splice donor 5th base",
VariantEffect.SPLICE_DONOR_REGION_VARIANT: "splice donor",
VariantEffect.SPLICE_POLYPYRIMIDINE_TRACT_VARIANT: "splice polypyrimidine",
VariantEffect.INCOMPLETE_TERMINAL_CODON_VARIANT: "incomplete terminal codon",
VariantEffect.START_RETAINED_VARIANT: "start retained",
VariantEffect.STOP_RETAINED_VARIANT: "stop retainined",
VariantEffect.SYNONYMOUS_VARIANT: "synonymous",
VariantEffect.CODING_SEQUENCE_VARIANT: "coding sequence",
VariantEffect.MATURE_MIRNA_VARIANT: "mature miRNA",
VariantEffect.FIVE_PRIME_UTR_VARIANT: "5UTR",
VariantEffect.THREE_PRIME_UTR_VARIANT: "3UTR",
VariantEffect.NON_CODING_TRANSCRIPT_EXON_VARIANT: "non-coding transcript exon",
VariantEffect.INTRON_VARIANT: "intronic",
VariantEffect.NMD_TRANSCRIPT_VARIANT: "NMD transcript",
VariantEffect.NON_CODING_TRANSCRIPT_VARIANT: "non-coding transcript",
VariantEffect.UPSTREAM_GENE_VARIANT: "upstream of gene",
VariantEffect.DOWNSTREAM_GENE_VARIANT: "downstream of gene",
VariantEffect.TFBS_ABLATION: "TFBS ablation",
VariantEffect.TFBS_AMPLIFICATION: "TFBS amplification",
VariantEffect.TF_BINDING_SITE_VARIANT: "TFBS binding site",
VariantEffect.REGULATORY_REGION_ABLATION: "regulatory region ablation",
VariantEffect.REGULATORY_REGION_AMPLIFICATION: "regulatory region amplification",
VariantEffect.FEATURE_ELONGATION: "feature elongation",
VariantEffect.REGULATORY_REGION_VARIANT: "regulatory region",
VariantEffect.FEATURE_TRUNCATION: "feature truncation",
VariantEffect.INTERGENIC_VARIANT: "intergenic",
VariantEffect.SEQUENCE_VARIANT: "sequence variant",
}

so_id_to_display = {
"SO:1000029": "chromosomal deletion",
"SO:1000037": "chromosomal duplication",
"SO:1000030": "chromosomal_inversion",
"SO:1000044": "chromosomal_translocation",
}
2 changes: 2 additions & 0 deletions src/gpsea/view/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ._cohort import CohortViewable
from ._cohort_variant_viewer import CohortVariantViewer
from ._disease import DiseaseViewable
from ._phenotype_analysis import summarize_hpo_analysis
from ._protein_viewer import ProteinViewable
Expand All @@ -9,6 +10,7 @@
from ._formatter import VariantFormatter

__all__ = [
'CohortVariantViewer',
'CohortViewable',
'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable',
'DiseaseViewable',
Expand Down
158 changes: 158 additions & 0 deletions src/gpsea/view/_cohort_variant_viewer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import typing

from jinja2 import Environment, PackageLoader
from collections import namedtuple, defaultdict

from gpsea.model import Cohort, Variant, VariantEffect
from ._formatter import VariantFormatter


ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects'])

VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'variant_effects'])


class CohortVariantViewer:
"""
`CohortVariantViewer` creates an HTML report with the cohort variants.
The report can be either written into an HTML file or displayed in a Jupyter notebook.
See :ref:show-cohort-variants: for an example usage.
"""

def __init__(
self,
tx_id: str
):
"""
Args:
tx_id (str): The transcript identifier (Usually, the MANE RefSeq transcript, that should start with "NM_")
"""
environment = Environment(loader=PackageLoader('gpsea.view', 'templates'))
self._cohort_template = environment.get_template("all_variants.html")
self._var_formatter = VariantFormatter(tx_id)
if not tx_id.startswith("NM"):
print(f"[WARNING] Non-RefSeq transcript id: {tx_id}")
self._transcript_id = tx_id

def process(
self,
cohort: Cohort,
only_hgvs: bool = True
) -> str:
"""
Create an HTML that should be shown with ``display(HTML(..))`` of the ipython package.
Args:
cohort (Cohort): The cohort being analyzed in the current notebook.
only_hgvs (bool): Do not show the transcript ID part of the HGVS annotation, just the annotation.
Returns:
str: an HTML string with parameterized template for rendering
"""
context = self._prepare_context(cohort, only_hgvs=only_hgvs)
return self._cohort_template.render(context)

def _prepare_context(
self,
cohort: Cohort,
only_hgvs: bool,
) -> typing.Mapping[str, typing.Any]:
variant_count_dictionary = defaultdict(int)
variant_effect_count_dictionary = defaultdict(int)
variant_key_to_variant = dict()
for var in cohort.all_variants():
var_key = var.variant_info.variant_key
vdata = self._get_variant_data(var, only_hgvs)
variant_key_to_variant[var_key] = vdata
variant_count_dictionary[var_key] += 1
for v_eff in vdata.variant_effects:
variant_effect_count_dictionary[v_eff] += 1
variant_counts = list()
variant_effect_counts = list()
for var_key, count in variant_count_dictionary.items():
var_data = variant_key_to_variant[var_key]
variant_counts.append(
{
"variant_key": var_data.variant_key,
"variant": var_data.hgvs_cdna,
"variant_name": var_data.hgvs_cdna,
"protein_name": var_data.hgvsp,
"variant_effects": ", ".join(var_data.variant_effects),
"count": count,
}
)
for v_effect, count in variant_effect_count_dictionary.items():
variant_effect_counts.append(
{
"effect": v_effect,
"count": count
}
)
variant_counts = sorted(variant_counts, key=lambda row: row.get("count"), reverse=True)
variant_effect_counts = sorted(variant_effect_counts, key=lambda row: row.get("count"), reverse=True)

# The following dictionary is used by the Jinja2 HTML template
return {
"has_transcript": False,
"variant_count_list": variant_counts,
"variant_effect_count_list": variant_effect_counts,
"total_unique_allele_count": len(variant_counts)
}

def _get_variant_data(
self,
variant: Variant,
only_hgvs: bool
) -> VariantData:
"""
Get user-friendly strings (e.g., HGVS for our target transcript) to match to the chromosomal strings
Args:
variant (Variant): The variant to be formatted.
only_hgvs (bool): do not show the transcript ID part of the HGVS annotation, just the annotation.
Returns:
VariantData: a named tuple with variant data formatted for human consumption.
"""
variant_key = variant.variant_info.variant_key
if variant.variant_info.has_sv_info():
sv_info = variant.variant_info.sv_info
gene_symbol = sv_info.gene_symbol
display = f"SV involving {gene_symbol}"
effect = VariantEffect.structural_so_id_to_display(so_term=sv_info.structural_type)
return VariantData(
variant_key=variant_key,
hgvs_cdna=display,
hgvsp="p.?",
variant_effects=[effect],
)
else:
variant_key = variant.variant_info.variant_key
display = self._var_formatter.format_as_string(variant)
tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id)
if tx_annotation is not None:
hgvsp = tx_annotation.hgvsp
var_effects = [var_eff.to_display() for var_eff in tx_annotation.variant_effects]
else:
hgvsp = None
var_effects = []
if only_hgvs:
# do not show the transcript id
fields_dna = display.split(":")
if len(fields_dna) > 1:
display_hgvs_cDNA = fields_dna[1]
else:
display_hgvs_cDNA = fields_dna[0]

fields_ps = hgvsp.split(":") if hgvsp is not None else (None,)
if len(fields_ps) > 1:
hgvsp = fields_ps[1]
else:
hgvsp = fields_ps[0]
return VariantData(
variant_key=variant_key,
hgvs_cdna=display_hgvs_cDNA,
hgvsp=hgvsp,
variant_effects=var_effects,
)
Loading

0 comments on commit a966998

Please sign in to comment.