Merge pull request #255 from monarch-initiative/counts_table

Counts table
monarch-initiative · Sep 10, 2024 · a966998 · a966998
2 parents 02c9663 + cd76b11
commit a966998
Show file tree

Hide file tree

Showing 8 changed files with 1,012 additions and 7 deletions.
diff --git a/docs/report/tbx5_all_variants.html b/docs/report/tbx5_all_variants.html
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -103,7 +103,13 @@ We loaded the patient data into a `cohort` which is ready for the next steps.
 Explore cohort
 ^^^^^^^^^^^^^^
 
-We can now explore the cohort to see how many patients are included.
+GPSEA helps with gaining insight into the cohort by providing 
+
+
+Show cohort summary
+-------------------
+
+The summary report provides an overview about the HPO terms, variants, diseases, and variant effects that occurr most frequently:
 
 >>> from gpsea.view import CohortViewable
 >>> viewer = CohortViewable(hpo)
@@ -121,6 +127,10 @@ We can now explore the cohort to see how many patients are included.
     from IPython.display import HTML, display
     display(HTML(report))
 
+
+Plot distribution of variants with respect to the protein sequence
+------------------------------------------------------------------
+
 Now we can show the distribution of variants with respect to the encoded protein.
 We first obtain `tx_coordinates` (:class:`~gpsea.model.TranscriptCoordinates`)
 and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`)
@@ -154,6 +164,27 @@ and we follow with plotting the diagram of the mutations on the protein:
    :width: 600px
 
 
+.. _show-cohort-variants:
+
+Summarize all variant alleles
+-----------------------------
+
+We can prepare a table of all variant alleles that occurr in the cohort.
+Each table row corresponds to a single allele and lists the variant key,
+the predicted effect on the transcript (*cDNA*) and protein of interest,
+the variant effects, and the number of patients who present
+with one or more variant alleles (*Count*):
+
+>>> from gpsea.view import CohortVariantViewer
+>>> viewer = CohortVariantViewer(tx_id=tx_id)
+>>> report = viewer.process(cohort=cohort)
+>>> with open('docs/report/tbx5_all_variants.html', 'w') as fh:  # doctest: +SKIP
+...     _ = fh.write(report)
+
+.. raw:: html
+  :file: report/tbx5_all_variants.html
+
+
 Prepare genotype and phenotype predicates
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py
@@ -1,7 +1,10 @@
-from enum import Enum
+import enum
+import typing
 
+import hpotk
 
-class VariantEffect(Enum):
+
+class VariantEffect(enum.Enum):
     """
     `VariantEffect` represents consequences of a variant on transcript that are supported by GPSEA.
 
@@ -47,7 +50,7 @@ class VariantEffect(Enum):
     THREE_PRIME_UTR_VARIANT = "SO:0001624"
     NON_CODING_TRANSCRIPT_EXON_VARIANT = "SO:0001792"
     INTRON_VARIANT = "SO:0001627"
-    NMD_TRANSCRIPT_VARIANT = "SO:0001621",
+    NMD_TRANSCRIPT_VARIANT = "SO:0001621"
     NON_CODING_TRANSCRIPT_VARIANT = "SO:0001619"
     UPSTREAM_GENE_VARIANT = "SO:0001631"
     DOWNSTREAM_GENE_VARIANT = "SO:0001632"
@@ -62,6 +65,43 @@ class VariantEffect(Enum):
     INTERGENIC_VARIANT = "SO:0001628"
     SEQUENCE_VARIANT = "SO:0001060"
 
+    def to_display(self) -> str:
+        """
+        Get a concise name of the variant effect that is suitable for showing to humans.
+
+        Example
+        ^^^^^^^
+
+        >>> from gpsea.model import VariantEffect
+        >>> VariantEffect.MISSENSE_VARIANT.to_display()
+        'missense'
+        >>> VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT.to_display()
+        'splice donor 5th base'
+
+        :returns: a `str` with the name or `'n/a'` if the variant effect was not assigned a concise name.
+        """
+        return effect_to_display.get(self, "n/a")
+
+    @staticmethod
+    def structural_so_id_to_display(so_term: typing.Union[hpotk.TermId, str]) -> str:
+        """
+        Get a `str` with a concise name for representing a Sequence Ontology (SO) term identifier.
+
+        Example
+        ^^^^^^^
+
+        >>> from gpsea.model import VariantEffect
+        >>> VariantEffect.structural_so_id_to_display('SO:1000029')
+        'chromosomal deletion'
+
+        :param so_term: a CURIE `str` or a :class:`~hpotk.TermId` with the query SO term.
+        :returns: a `str` with the concise name for the SO term or `'n/a'` if a name has not been assigned yet.
+        """
+        if isinstance(so_term, hpotk.TermId):
+            so_term = so_term.value
+
+        return so_id_to_display.get(so_term, "n/a")
+
     def __init__(self, curie: str):
         self._curie = curie
 
@@ -75,3 +115,54 @@ def curie(self) -> str:
 
     def __str__(self) -> str:
         return self.name.lower()
+
+
+effect_to_display = {
+    VariantEffect.TRANSCRIPT_ABLATION: "transcript ablation",
+    VariantEffect.SPLICE_ACCEPTOR_VARIANT: "splice acceptor",
+    VariantEffect.SPLICE_DONOR_VARIANT: "splice donor",
+    VariantEffect.STOP_GAINED: "stop gained",
+    VariantEffect.FRAMESHIFT_VARIANT: "frameshift",
+    VariantEffect.STOP_LOST: "stop lost",
+    VariantEffect.START_LOST: "start lost",
+    VariantEffect.TRANSCRIPT_AMPLIFICATION: "transcript amplification",
+    VariantEffect.INFRAME_INSERTION: "inframe insertion",
+    VariantEffect.INFRAME_DELETION: "inframe deletion",
+    VariantEffect.MISSENSE_VARIANT: "missense",
+    VariantEffect.PROTEIN_ALTERING_VARIANT: "protein altering",
+    VariantEffect.SPLICE_REGION_VARIANT: "splice region",
+    VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT: "splice donor 5th base",
+    VariantEffect.SPLICE_DONOR_REGION_VARIANT: "splice donor",
+    VariantEffect.SPLICE_POLYPYRIMIDINE_TRACT_VARIANT: "splice polypyrimidine",
+    VariantEffect.INCOMPLETE_TERMINAL_CODON_VARIANT: "incomplete terminal codon",
+    VariantEffect.START_RETAINED_VARIANT: "start retained",
+    VariantEffect.STOP_RETAINED_VARIANT: "stop retainined",
+    VariantEffect.SYNONYMOUS_VARIANT: "synonymous",
+    VariantEffect.CODING_SEQUENCE_VARIANT: "coding sequence",
+    VariantEffect.MATURE_MIRNA_VARIANT: "mature miRNA",
+    VariantEffect.FIVE_PRIME_UTR_VARIANT: "5UTR",
+    VariantEffect.THREE_PRIME_UTR_VARIANT: "3UTR",
+    VariantEffect.NON_CODING_TRANSCRIPT_EXON_VARIANT: "non-coding transcript exon",
+    VariantEffect.INTRON_VARIANT: "intronic",
+    VariantEffect.NMD_TRANSCRIPT_VARIANT: "NMD transcript",
+    VariantEffect.NON_CODING_TRANSCRIPT_VARIANT: "non-coding transcript",
+    VariantEffect.UPSTREAM_GENE_VARIANT: "upstream of gene",
+    VariantEffect.DOWNSTREAM_GENE_VARIANT: "downstream of gene",
+    VariantEffect.TFBS_ABLATION: "TFBS ablation",
+    VariantEffect.TFBS_AMPLIFICATION: "TFBS amplification",
+    VariantEffect.TF_BINDING_SITE_VARIANT: "TFBS binding site",
+    VariantEffect.REGULATORY_REGION_ABLATION: "regulatory region ablation",
+    VariantEffect.REGULATORY_REGION_AMPLIFICATION: "regulatory region amplification",
+    VariantEffect.FEATURE_ELONGATION: "feature elongation",
+    VariantEffect.REGULATORY_REGION_VARIANT: "regulatory region",
+    VariantEffect.FEATURE_TRUNCATION: "feature truncation",
+    VariantEffect.INTERGENIC_VARIANT: "intergenic",
+    VariantEffect.SEQUENCE_VARIANT: "sequence variant",
+}
+
+so_id_to_display = {
+    "SO:1000029": "chromosomal deletion",
+    "SO:1000037": "chromosomal duplication",
+    "SO:1000030": "chromosomal_inversion",
+    "SO:1000044": "chromosomal_translocation",
+}
diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py
@@ -1,4 +1,5 @@
 from ._cohort import CohortViewable
+from ._cohort_variant_viewer import CohortVariantViewer
 from ._disease import DiseaseViewable
 from ._phenotype_analysis import summarize_hpo_analysis
 from ._protein_viewer import ProteinViewable
@@ -9,6 +10,7 @@
 from ._formatter import VariantFormatter
 
 __all__ = [
+    'CohortVariantViewer',
     'CohortViewable',
     'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable',
     'DiseaseViewable',

diff --git a/src/gpsea/view/_cohort_variant_viewer.py b/src/gpsea/view/_cohort_variant_viewer.py
@@ -0,0 +1,158 @@
+import typing
+
+from jinja2 import Environment, PackageLoader
+from collections import namedtuple, defaultdict
+
+from gpsea.model import Cohort, Variant, VariantEffect
+from ._formatter import VariantFormatter
+
+
+ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects'])
+
+VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'variant_effects'])
+
+
+class CohortVariantViewer:
+    """
+    `CohortVariantViewer` creates an HTML report with the cohort variants.
+
+    The report can be either written into an HTML file or displayed in a Jupyter notebook.
+
+    See :ref:show-cohort-variants: for an example usage.
+    """
+
+    def __init__(
+        self,
+        tx_id: str
+    ):
+        """
+        Args:
+            tx_id (str): The transcript identifier (Usually, the MANE RefSeq transcript, that should start with "NM_")
+        """
+        environment = Environment(loader=PackageLoader('gpsea.view', 'templates'))
+        self._cohort_template = environment.get_template("all_variants.html")
+        self._var_formatter = VariantFormatter(tx_id)
+        if not tx_id.startswith("NM"):
+            print(f"[WARNING] Non-RefSeq transcript id: {tx_id}")
+        self._transcript_id = tx_id
+
+    def process(
+        self,
+        cohort: Cohort,
+        only_hgvs: bool = True
+    ) -> str:
+        """
+        Create an HTML that should be shown with ``display(HTML(..))`` of the ipython package.
+
+        Args:
+            cohort (Cohort): The cohort being analyzed in the current notebook.
+            only_hgvs (bool): Do not show the transcript ID part of the HGVS annotation, just the annotation.
+
+        Returns:
+            str: an HTML string with parameterized template for rendering
+        """
+        context = self._prepare_context(cohort, only_hgvs=only_hgvs)
+        return self._cohort_template.render(context)
+
+    def _prepare_context(
+        self,
+        cohort: Cohort,
+        only_hgvs: bool,
+    ) -> typing.Mapping[str, typing.Any]:
+        variant_count_dictionary = defaultdict(int)
+        variant_effect_count_dictionary = defaultdict(int)
+        variant_key_to_variant = dict()
+        for var in cohort.all_variants():
+            var_key = var.variant_info.variant_key
+            vdata = self._get_variant_data(var, only_hgvs)
+            variant_key_to_variant[var_key] = vdata
+            variant_count_dictionary[var_key] += 1
+            for v_eff in vdata.variant_effects:
+                variant_effect_count_dictionary[v_eff] += 1
+        variant_counts = list()
+        variant_effect_counts = list()
+        for var_key, count in variant_count_dictionary.items():
+            var_data = variant_key_to_variant[var_key]
+            variant_counts.append(
+                {
+                    "variant_key": var_data.variant_key,
+                    "variant": var_data.hgvs_cdna,
+                    "variant_name": var_data.hgvs_cdna,
+                    "protein_name": var_data.hgvsp,
+                    "variant_effects": ", ".join(var_data.variant_effects),
+                    "count": count,
+                }
+            )
+        for v_effect, count in variant_effect_count_dictionary.items():
+            variant_effect_counts.append(
+                {
+                    "effect": v_effect,
+                    "count": count
+                }
+            )
+        variant_counts = sorted(variant_counts, key=lambda row: row.get("count"), reverse=True)
+        variant_effect_counts = sorted(variant_effect_counts, key=lambda row: row.get("count"), reverse=True)
+
+        # The following dictionary is used by the Jinja2 HTML template
+        return {
+            "has_transcript": False,
+            "variant_count_list": variant_counts,
+            "variant_effect_count_list": variant_effect_counts,
+            "total_unique_allele_count": len(variant_counts)
+        }
+
+    def _get_variant_data(
+        self,
+        variant: Variant,
+        only_hgvs: bool
+    ) -> VariantData:
+        """
+        Get user-friendly strings (e.g., HGVS for our target transcript) to match to the chromosomal strings
+        Args:
+            variant (Variant): The variant to be formatted.
+            only_hgvs (bool): do not show the transcript ID part of the HGVS annotation, just the annotation.
+
+        Returns:
+            VariantData: a named tuple with variant data formatted for human consumption.
+        """
+        variant_key = variant.variant_info.variant_key
+        if variant.variant_info.has_sv_info():
+            sv_info = variant.variant_info.sv_info
+            gene_symbol = sv_info.gene_symbol
+            display = f"SV involving {gene_symbol}"
+            effect = VariantEffect.structural_so_id_to_display(so_term=sv_info.structural_type)
+            return VariantData(
+                variant_key=variant_key,
+                hgvs_cdna=display,
+                hgvsp="p.?",
+                variant_effects=[effect],
+            )
+        else:
+            variant_key = variant.variant_info.variant_key
+            display = self._var_formatter.format_as_string(variant)
+            tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id)
+            if tx_annotation is not None:
+                hgvsp = tx_annotation.hgvsp
+                var_effects = [var_eff.to_display() for var_eff in tx_annotation.variant_effects]
+            else:
+                hgvsp = None
+                var_effects = []
+            if only_hgvs:
+                # do not show the transcript id
+                fields_dna = display.split(":")
+                if len(fields_dna) > 1:
+                    display_hgvs_cDNA = fields_dna[1]
+                else:
+                    display_hgvs_cDNA = fields_dna[0]
+
+                fields_ps = hgvsp.split(":") if hgvsp is not None else (None,)
+                if len(fields_ps) > 1:
+                    hgvsp = fields_ps[1]
+                else:
+                    hgvsp = fields_ps[0]
+            return VariantData(
+                variant_key=variant_key,
+                hgvs_cdna=display_hgvs_cDNA,
+                hgvsp=hgvsp,
+                variant_effects=var_effects,
+            )