Merge pull request #254 from monarch-initiative/improve-summarize

Improve summarize
monarch-initiative · Sep 9, 2024 · 51c439c · 51c439c
2 parents a314f26 + 9e75d23
commit 51c439c
Show file tree

Hide file tree

Showing 12 changed files with 208 additions and 125 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # Cache with transcript/protein pickle files
 .gpsea_cache
+dev/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv
@@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598
 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855
 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713
 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784
-Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
-Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0
 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
+Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0
 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
-Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,,
-Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,,
-Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,,
+Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
+Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,,
+Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,,
+Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -268,8 +268,8 @@ by exploring the phenotype MTC filtering report.
 
 and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:
 
->>> from gpsea.analysis.predicate import PatientCategories
->>> summary_df = result.summarize(hpo, PatientCategories.YES)
+>>> from gpsea.view import summarize_hpo_analysis
+>>> summary_df = summarize_hpo_analysis(hpo, result)
 >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv')  # doctest: +SKIP
 
 .. csv-table:: *TBX5* frameshift vs missense

diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst
@@ -254,8 +254,8 @@ Last, let's explore the associations. The results include a table with all teste
 ordered by the corrected p value (Benjamini-Hochberg FDR).
 Here we show the top 20 table rows:
 
->>> from gpsea.analysis.predicate import PatientCategories
->>> summary_df = result.summarize(hpo, PatientCategories.YES)
+>>> from gpsea.view import summarize_hpo_analysis
+>>> summary_df = summarize_hpo_analysis(hpo, result)
 >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv')  # doctest: +SKIP
 
 .. csv-table:: *TBX5* frameshift vs rest

diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
@@ -13,7 +13,6 @@
 
 from gpsea.model import Patient
 from gpsea.analysis.pcats.stats import CountStatistic
-from ..predicate import PatientCategory
 from ..predicate.genotype import GenotypePolyPredicate
 from ..predicate.phenotype import P, PhenotypePolyPredicate
 from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult
@@ -101,14 +100,6 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta):
     with the order determined by the phenotype order.
     """
 
-    @property
-    @abc.abstractmethod
-    def phenotypes(self) -> typing.Sequence[P]:
-        """
-        Get the tested phenotypes.
-        """
-        pass
-
     @property
     @abc.abstractmethod
     def n_usable(self) -> typing.Sequence[int]:
@@ -165,95 +156,6 @@ def total_tests(self) -> int:
         """
         return sum(1 for pval in self.pvals if not math.isnan(pval))
 
-    def summarize(
-        self,
-        hpo: hpotk.MinimalOntology,
-        target_phenotype_category: PatientCategory,
-    ) -> pd.DataFrame:
-        """
-        Create a data frame with summary of the genotype phenotype analysis.
-
-        The *rows* of the frame correspond to the analyzed HPO terms.
-
-        The columns of the data frame have `Count` and `Percentage` per used genotype predicate.
-
-        **Example**
-
-        If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate`
-        which can compare phenotype with and without a missense variant, we will have a data frame
-        that looks like this::
-
-            MISSENSE_VARIANT on `NM_1234.5`                       No                  Yes
-                                                                  Count    Percent    Count    Percent   Corrected p value  p value
-            Arachnodactyly [HP:0001166]                           1/10         10%    13/16        81%   0.020299           0.000781
-            Abnormality of the musculature [HP:0003011]           6/6         100%    11/11       100%   1.000000           1.000000
-            Abnormal nervous system physiology [HP:0012638]       9/9         100%    15/15       100%   1.000000           1.000000
-            ...                                                   ...      ...        ...      ...       ...                ...
-        """
-
-        # TODO: this should be plotted by a formatter.
-        # Row index: a list of tested HPO terms
-        pheno_idx = pd.Index(self.phenotypes)
-        # Column index: multiindex of counts and percentages for all genotype predicate groups
-        gt_idx = pd.MultiIndex.from_product(
-            # TODO: fix the below
-            iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")),
-            names=(self._gt_predicate.get_question_base(), None),
-        )
-
-        # We'll fill this frame with data
-        df = pd.DataFrame(index=pheno_idx, columns=gt_idx)
-
-        for phenotype, count in zip(self.phenotypes, self.all_counts):
-            gt_totals = (
-                count.sum()
-            )  # Sum across the phenotype categories (collapse the rows).
-            for gt_cat in count.columns:
-                cnt = count.loc[target_phenotype_category, gt_cat]
-                total = gt_totals[gt_cat]
-                df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}"
-                pct = 0 if total == 0 else round(cnt * 100 / total)
-                df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%"
-
-        # Add columns with p values and corrected p values (if present)
-        p_val_col_name = "p values"
-        corrected_p_val_col_name = "Corrected p values"
-        if self.corrected_pvals is not None:
-            df.insert(
-                df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals
-            )
-        df.insert(df.shape[1], ("", p_val_col_name), self.pvals)
-
-        # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms
-        # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`).
-        labeled_idx = df.index.map(
-            lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id)
-        )
-
-        # Last, sort by corrected p value or just p value
-        df = df.set_index(labeled_idx)
-        if self.corrected_pvals is not None:
-            return df.sort_values(
-                by=[("", corrected_p_val_col_name), ("", p_val_col_name)]
-            )
-        else:
-            return df.sort_values(by=("", p_val_col_name))
-
-    @staticmethod
-    def _format_term_id(
-        hpo: hpotk.MinimalOntology,
-        term_id: hpotk.TermId,
-    ) -> str:
-        """
-        Format a `term_id` as a `str`. HPO term ID is formatted as `<name> [<term_id>]` whereas other term IDs
-        are formatted as CURIEs (e.g. `OMIM:123000`).
-        """
-        if term_id.prefix == "HP":
-            term_name = hpo.get_term_name(term_id)
-            return f"{term_name} [{term_id.value}]"
-        else:
-            return term_id.value
-
 
 class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta):
 
@@ -377,7 +279,8 @@ def _compute_nominal_pvals(
     def _apply_mtc(
         self,
         pvals: typing.Sequence[float],
-    ) -> typing.Optional[typing.Sequence[float]]:
+    ) -> typing.Sequence[float]:
+        assert self._mtc_correction is not None
         _, corrected_pvals, _, _ = multitest.multipletests(
             pvals=pvals,
             alpha=self._mtc_alpha,
@@ -392,14 +295,14 @@ class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysis
 
     def __init__(
         self,
-        phenotypes: typing.Sequence[P],
+        pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
         n_usable: typing.Sequence[int],
         all_counts: typing.Sequence[pd.DataFrame],
         pvals: typing.Sequence[float],
         corrected_pvals: typing.Optional[typing.Sequence[float]],
         gt_predicate: GenotypePolyPredicate,
     ):
-        self._phenotypes = tuple(phenotypes)
+        self._pheno_predicates = tuple(pheno_predicates)
         self._n_usable = tuple(n_usable)
         self._all_counts = tuple(all_counts)
         self._pvals = tuple(pvals)
@@ -409,9 +312,19 @@ def __init__(
         assert isinstance(gt_predicate, GenotypePolyPredicate)
         self._gt_predicate = gt_predicate
 
+    @property
+    def gt_predicate(self) -> GenotypePolyPredicate:
+        return self._gt_predicate
+
+    @property
+    def pheno_predicates(
+        self,
+    ) -> typing.Sequence[PhenotypePolyPredicate[P]]:
+        return self._pheno_predicates
+
     @property
     def phenotypes(self) -> typing.Sequence[P]:
-        return self._phenotypes
+        return tuple(p.phenotype for p in self._pheno_predicates)
 
     @property
     def n_usable(self) -> typing.Sequence[int]:
@@ -431,7 +344,7 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]:
 
     def __eq__(self, other):
         return isinstance(other, BaseMultiPhenotypeAnalysisResult) \
-            and self._phenotypes == other._phenotypes \
+            and self._pheno_predicates == other._pheno_predicates \
             and self._n_usable == other._n_usable \
             and self._all_counts == other._all_counts \
             and self._pvals == other._pvals \
@@ -441,7 +354,7 @@ def __eq__(self, other):
     def __hash__(self):
         # NOTE: if a field is added, the hash method of the subclasses must be updated as well.
         return hash((
-            self._phenotypes, self._n_usable, self._all_counts,
+            self._pheno_predicates, self._n_usable, self._all_counts,
             self._pvals, self._corrected_pvals, self._gt_predicate,
         ))
 
@@ -474,10 +387,8 @@ def _compute_result(
         else:
             corrected_pvals = self._apply_mtc(pvals=pvals)
 
-        phenotypes = tuple(p.phenotype for p in pheno_predicates)
-
         return BaseMultiPhenotypeAnalysisResult(
-            phenotypes=phenotypes,
+            pheno_predicates=pheno_predicates,
             n_usable=n_usable,
             all_counts=all_counts,
             pvals=pvals,
@@ -496,7 +407,7 @@ class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]):
 
     def __init__(
         self,
-        phenotypes: typing.Sequence[hpotk.TermId],
+        pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]],
         n_usable: typing.Sequence[int],
         all_counts: typing.Sequence[pd.DataFrame],
         pvals: typing.Sequence[float],
@@ -507,7 +418,7 @@ def __init__(
         mtc_name: typing.Optional[str],
     ):
         super().__init__(
-            phenotypes=phenotypes,
+            pheno_predicates=pheno_predicates,
             n_usable=n_usable,
             all_counts=all_counts,
             pvals=pvals,
@@ -549,7 +460,7 @@ def __eq__(self, other):
 
     def __hash__(self):
         return hash((
-            self._phenotypes, self._n_usable, self._all_counts,
+            self._pheno_predicates, self._n_usable, self._all_counts,
             self._pvals, self._corrected_pvals, self._gt_predicate,
             self._mtc_filter_name, self._mtc_filter_results, self._mtc_name,
         ))
@@ -590,7 +501,6 @@ def _compute_result(
         pheno_predicates = tuple(pheno_predicates)
         if len(pheno_predicates) == 0:
             raise ValueError("No phenotype predicates were provided")
-        phenotypes = tuple(p.phenotype for p in pheno_predicates)
 
         # 1 - Count the patients
         n_usable, all_counts = apply_predicates_on_patients(
@@ -625,7 +535,7 @@ def _compute_result(
             corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask])
 
         return HpoTermAnalysisResult(
-            phenotypes=phenotypes,
+            pheno_predicates=pheno_predicates,
             n_usable=n_usable,
             all_counts=all_counts,
             pvals=pvals,

diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -13,6 +13,8 @@
 from ._api import VariantPredicate
 from ._counter import AlleleCounter
 
+# TODO: implement __hash__, __eq__ on predicates
+
 
 class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate):
     # NOT PART OF THE PUBLIC API

diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -197,6 +197,15 @@ def test(
 
         return None
 
+    def __eq__(self, value: object) -> bool:
+        return isinstance(value, PropagatingPhenotypePredicate) \
+            and self._hpo.version == value._hpo.version \
+            and self._query == value._query \
+            and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded
+
+    def __hash__(self) -> int:
+        return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded))
+
     def __repr__(self):
         return f"PropagatingPhenotypeBooleanPredicate(query={self._query})"
 
@@ -264,5 +273,12 @@ def test(
 
         return self._diagnosis_excluded
 
+    def __eq__(self, value: object) -> bool:
+        return isinstance(value, DiseasePresencePredicate) \
+            and self._query == value._query
+
+    def __hash__(self) -> int:
+        return hash((self._query,))
+
     def __repr__(self):
         return f"DiseasePresencePredicate(query={self._query})"
diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py
@@ -1,5 +1,6 @@
 from ._cohort import CohortViewable
 from ._disease import DiseaseViewable
+from ._phenotype_analysis import summarize_hpo_analysis
 from ._protein_viewer import ProteinViewable
 from ._protein_visualizable import ProteinVisualizable
 from ._stats import MtcStatsViewer
@@ -12,6 +13,7 @@
     'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable',
     'DiseaseViewable',
     'MtcStatsViewer',
+    'summarize_hpo_analysis',
     'VariantTranscriptVisualizer',
     'VariantFormatter',
 ]