From fc72e79c677e7629d8b29571ccfbd256f7eecd87 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 5 Sep 2024 11:11:23 +0200 Subject: [PATCH 1/7] Ignore `dev` folder. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8ad720caa..777fd2adc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Cache with transcript/protein pickle files .gpsea_cache +dev/ # Byte-compiled / optimized / DLL files __pycache__/ From ac1422a3b9cbc1f3f38d55b39f55ffea7756a470 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sat, 7 Sep 2024 20:54:51 +0200 Subject: [PATCH 2/7] Keep track of phenotype predicates in the results. --- src/gpsea/analysis/pcats/_impl.py | 23 ++++++++----------- .../predicate/genotype/_gt_predicates.py | 3 ++- .../analysis/predicate/phenotype/_pheno.py | 16 +++++++++++++ tests/view/test_stats.py | 14 ++++++++--- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index e8045d2d0..acc4b9210 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -392,14 +392,14 @@ class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysis def __init__( self, - phenotypes: typing.Sequence[P], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], corrected_pvals: typing.Optional[typing.Sequence[float]], gt_predicate: GenotypePolyPredicate, ): - self._phenotypes = tuple(phenotypes) + self._pheno_predicates = tuple(pheno_predicates) self._n_usable = tuple(n_usable) self._all_counts = tuple(all_counts) self._pvals = tuple(pvals) @@ -411,7 +411,7 @@ def __init__( @property def phenotypes(self) -> typing.Sequence[P]: - return self._phenotypes + return tuple(p.phenotype for p in self._pheno_predicates) @property def n_usable(self) -> typing.Sequence[int]: @@ -431,7 +431,7 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: def __eq__(self, other): return isinstance(other, BaseMultiPhenotypeAnalysisResult) \ - and self._phenotypes == other._phenotypes \ + and self._pheno_predicates == other._pheno_predicates \ and self._n_usable == other._n_usable \ and self._all_counts == other._all_counts \ and self._pvals == other._pvals \ @@ -441,7 +441,7 @@ def __eq__(self, other): def __hash__(self): # NOTE: if a field is added, the hash method of the subclasses must be updated as well. return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, )) @@ -474,10 +474,8 @@ def _compute_result( else: corrected_pvals = self._apply_mtc(pvals=pvals) - phenotypes = tuple(p.phenotype for p in pheno_predicates) - return BaseMultiPhenotypeAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -496,7 +494,7 @@ class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]): def __init__( self, - phenotypes: typing.Sequence[hpotk.TermId], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], @@ -507,7 +505,7 @@ def __init__( mtc_name: typing.Optional[str], ): super().__init__( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -549,7 +547,7 @@ def __eq__(self, other): def __hash__(self): return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, self._mtc_filter_name, self._mtc_filter_results, self._mtc_name, )) @@ -590,7 +588,6 @@ def _compute_result( pheno_predicates = tuple(pheno_predicates) if len(pheno_predicates) == 0: raise ValueError("No phenotype predicates were provided") - phenotypes = tuple(p.phenotype for p in pheno_predicates) # 1 - Count the patients n_usable, all_counts = apply_predicates_on_patients( @@ -625,7 +622,7 @@ def _compute_result( corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask]) return HpoTermAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 3a9301273..7c4ddcfd2 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,4 +1,3 @@ -from cProfile import label import dataclasses import enum import typing @@ -14,6 +13,8 @@ from ._api import VariantPredicate from ._counter import AlleleCounter +# TODO: implement __hash__, __eq__ on predicates + class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 3b2d9a7b4..7d8a9a530 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -195,6 +195,15 @@ def test( return None + def __eq__(self, value: object) -> bool: + return isinstance(value, PropagatingPhenotypePredicate) \ + and self._hpo.version == value._hpo.version \ + and self._query == value._query \ + and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded + + def __hash__(self) -> int: + return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) + def __repr__(self): return f"PropagatingPhenotypeBooleanPredicate(query={self._query})" @@ -262,5 +271,12 @@ def test( return self._diagnosis_excluded + def __eq__(self, value: object) -> bool: + return isinstance(value, DiseasePresencePredicate) \ + and self._query == value._query + + def __hash__(self) -> int: + return hash((self._query,)) + def __repr__(self): return f"DiseasePresencePredicate(query={self._query})" diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index 691f4f6a4..d38556f67 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -7,6 +7,7 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer @@ -16,12 +17,19 @@ class TestStatsViewable: @pytest.fixture(scope='class') def hpo_term_analysis_result( self, + hpo: hpotk.MinimalOntology, suox_gt_predicate: GenotypePolyPredicate, ) -> HpoTermAnalysisResult: return HpoTermAnalysisResult( - phenotypes=( - hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly - hpotk.TermId.from_curie('HP:0001250'), # Seizure + pheno_predicates=( + PropagatingPhenotypePredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly + ), + PropagatingPhenotypePredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001250'), # Seizure + ), ), n_usable=(40, 20), all_counts=( From ddc35c3940c81ef4d45aa0458582c847770c44a1 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sat, 7 Sep 2024 23:12:43 +0200 Subject: [PATCH 3/7] Create `HpoTermAnalysisResultFormatter`. --- src/gpsea/analysis/pcats/_impl.py | 111 ++---------------- src/gpsea/view/__init__.py | 2 + src/gpsea/view/_phenotype_analysis.py | 91 ++++++++++++++ tests/conftest.py | 58 ++++++++- .../view/test_hpo_phenotype_result_viewer.py | 27 +++++ 5 files changed, 189 insertions(+), 100 deletions(-) create mode 100644 src/gpsea/view/_phenotype_analysis.py create mode 100644 tests/view/test_hpo_phenotype_result_viewer.py diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index acc4b9210..c7eb6a616 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -13,7 +13,6 @@ from gpsea.model import Patient from gpsea.analysis.pcats.stats import CountStatistic -from ..predicate import PatientCategory from ..predicate.genotype import GenotypePolyPredicate from ..predicate.phenotype import P, PhenotypePolyPredicate from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult @@ -101,14 +100,6 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta): with the order determined by the phenotype order. """ - @property - @abc.abstractmethod - def phenotypes(self) -> typing.Sequence[P]: - """ - Get the tested phenotypes. - """ - pass - @property @abc.abstractmethod def n_usable(self) -> typing.Sequence[int]: @@ -165,95 +156,6 @@ def total_tests(self) -> int: """ return sum(1 for pval in self.pvals if not math.isnan(pval)) - def summarize( - self, - hpo: hpotk.MinimalOntology, - target_phenotype_category: PatientCategory, - ) -> pd.DataFrame: - """ - Create a data frame with summary of the genotype phenotype analysis. - - The *rows* of the frame correspond to the analyzed HPO terms. - - The columns of the data frame have `Count` and `Percentage` per used genotype predicate. - - **Example** - - If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate` - which can compare phenotype with and without a missense variant, we will have a data frame - that looks like this:: - - MISSENSE_VARIANT on `NM_1234.5` No Yes - Count Percent Count Percent Corrected p value p value - Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781 - Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000 - Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000 - ... ... ... ... ... ... ... - """ - - # TODO: this should be plotted by a formatter. - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(self.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - # TODO: fix the below - iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")), - names=(self._gt_predicate.get_question_base(), None), - ) - - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - - for phenotype, count in zip(self.phenotypes, self.all_counts): - gt_totals = ( - count.sum() - ) # Sum across the phenotype categories (collapse the rows). - for gt_cat in count.columns: - cnt = count.loc[target_phenotype_category, gt_cat] - total = gt_totals[gt_cat] - df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%" - - # Add columns with p values and corrected p values (if present) - p_val_col_name = "p values" - corrected_p_val_col_name = "Corrected p values" - if self.corrected_pvals is not None: - df.insert( - df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals - ) - df.insert(df.shape[1], ("", p_val_col_name), self.pvals) - - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map( - lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id) - ) - - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if self.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ) - else: - return df.sort_values(by=("", p_val_col_name)) - - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). - """ - if term_id.prefix == "HP": - term_name = hpo.get_term_name(term_id) - return f"{term_name} [{term_id.value}]" - else: - return term_id.value - class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta): @@ -377,7 +279,8 @@ def _compute_nominal_pvals( def _apply_mtc( self, pvals: typing.Sequence[float], - ) -> typing.Optional[typing.Sequence[float]]: + ) -> typing.Sequence[float]: + assert self._mtc_correction is not None _, corrected_pvals, _, _ = multitest.multipletests( pvals=pvals, alpha=self._mtc_alpha, @@ -409,6 +312,16 @@ def __init__( assert isinstance(gt_predicate, GenotypePolyPredicate) self._gt_predicate = gt_predicate + @property + def gt_predicate(self) -> GenotypePolyPredicate: + return self._gt_predicate + + @property + def pheno_predicates( + self, + ) -> typing.Sequence[PhenotypePolyPredicate[P]]: + return self._pheno_predicates + @property def phenotypes(self) -> typing.Sequence[P]: return tuple(p.phenotype for p in self._pheno_predicates) diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 6eb13dd61..e236f802c 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,5 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable +from ._phenotype_analysis import HpoTermAnalysisResultFormatter from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -12,6 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', + 'HpoTermAnalysisResultFormatter', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py new file mode 100644 index 000000000..26cec71bd --- /dev/null +++ b/src/gpsea/view/_phenotype_analysis.py @@ -0,0 +1,91 @@ +import hpotk +import pandas as pd + +from gpsea.analysis.pcats import HpoTermAnalysisResult + + +class HpoTermAnalysisResultFormatter: + """ + `HpoTermAnalysisResultFormatter` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` + in a form suitable for humans. + + This includes preparing a dataframe with counts, frequencies, and p values for the tested HPO terms. + """ + + def __init__( + self, + hpo: hpotk.MinimalOntology, + ): + self._hpo = hpo + + def make_summary_dataframe( + self, + result: HpoTermAnalysisResult, + ) -> pd.DataFrame: + """ + Format the `result` into a dataframe for human consumption. + """ + assert isinstance(result, HpoTermAnalysisResult) + + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(result.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), + names=(result.gt_predicate.get_question_base(), None), + ) + + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + + for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + # Sum across the phenotype categories (collapse the rows). + gt_totals = count.sum() + + for gt_cat in count.columns: + cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if result.corrected_pvals is not None: + df.insert( + df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals + ) + df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map( + lambda term_id: HpoTermAnalysisResultFormatter._format_term_id( + self._hpo, term_id + ) + ) + + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + if result.corrected_pvals is not None: + return df.sort_values( + by=[("", corrected_p_val_col_name), ("", p_val_col_name)] + ) + else: + return df.sort_values(by=("", p_val_col_name)) + + @staticmethod + def _format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, + ) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). + """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value diff --git a/tests/conftest.py b/tests/conftest.py index b66b55e7d..488e08698 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,11 +3,15 @@ import typing import hpotk +import numpy as np +import pandas as pd import pytest +from gpsea.analysis.mtc_filter import PhenotypeMtcResult +from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate -from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder +from gpsea.io import GpseaJSONDecoder from gpsea.model import * from gpsea.model.genome import GRCh38, GenomicRegion, Region, Strand, GenomeBuild from ._protein_test_service import ProteinTestMetadataService @@ -32,6 +36,7 @@ def pytest_collection_modifyitems(config, items): if "online" in item.keywords: item.add_marker(skip_online) + @pytest.fixture(scope='session') def fpath_project_dir(fpath_test_dir: str) -> str: """ @@ -151,6 +156,57 @@ def suox_pheno_predicates( ) +@pytest.fixture +def hpo_result( + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_predicate: GenotypePolyPredicate, +) -> HpoTermAnalysisResult: + return HpoTermAnalysisResult( + pheno_predicates=suox_pheno_predicates, + n_usable=(40, 20, 30, 10, 100), + all_counts=tuple( + make_count_df(count, suox_gt_predicate, ph_pred) + for count, ph_pred in zip( + [ + (10, 5, 15, 10), + (5, 2, 8, 5), + (10, 5, 5, 10), + (2, 3, 2, 3), + (10, 25, 35, 30), + ], + suox_pheno_predicates, + ) + ), + pvals=(0.01, 0.04, 0.3, 0.2, 0.7), + corrected_pvals=(0.04, 0.1, 0.7, 0.5, 1.0), + gt_predicate=suox_gt_predicate, + mtc_filter_name="MTC filter name", + mtc_filter_results=( + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + ), + mtc_name="MTC name", + ) + + +def make_count_df( + counts: typing.Tuple[int, int, int, int], + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], +) -> pd.DataFrame: + rows = tuple(ph_predicate.get_categories()) + cols = tuple(gt_predicate.get_categories()) + data = np.array(counts).reshape((len(rows), len(cols))) + return pd.DataFrame( + data=data, + index=pd.Index(rows), + columns=pd.Index(cols), + ) + + @pytest.fixture(scope='session') def fpath_suox_tx_coordinates(fpath_test_data_dir: str) -> str: suox_mane_tx_id = 'NM_001032386.2' diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py new file mode 100644 index 000000000..71122eb9c --- /dev/null +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -0,0 +1,27 @@ +import hpotk +import pytest + +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.view import HpoTermAnalysisResultFormatter + + +class TestHpoTermAnalysisResultFormatter: + + @pytest.fixture + def formatter( + self, + hpo: hpotk.MinimalOntology, + ) -> HpoTermAnalysisResultFormatter: + return HpoTermAnalysisResultFormatter( + hpo=hpo, + ) + + @pytest.mark.skip("Only for manual control") + def test_summarize( + self, + formatter: HpoTermAnalysisResultFormatter, + hpo_result: HpoTermAnalysisResult, + ): + df = formatter.make_summary_dataframe(result=hpo_result) + + print(df) From db445a5d138ef67f75fba5772c17b27ab40324e4 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sat, 7 Sep 2024 23:31:48 +0200 Subject: [PATCH 4/7] Use the HPO analysis result formatter in the docs. --- docs/report/tbx5_frameshift_vs_missense.csv | 8 ++++---- docs/tutorial.rst | 5 +++-- docs/user-guide/report/tbx5_frameshift.csv | 2 +- docs/user-guide/stats.rst | 5 +++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 0bc1e5c6c..772f062f9 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,4 +1,4 @@ -"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, +Genotype group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 -Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, -Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, +Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, +Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 0a583b39e..d51d38685 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,8 +268,9 @@ by exploring the phenotype MTC filtering report. and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> from gpsea.view import HpoTermAnalysisResultFormatter +>>> formatter = HpoTermAnalysisResultFormatter(hpo) +>>> summary_df = formatter.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index a96700cfb..1f78c1c16 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,4 +1,4 @@ -"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +Which genotype group does the patient fit in,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 55757ef30..c25870507 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,8 +254,9 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> from gpsea.view import HpoTermAnalysisResultFormatter +>>> formatter = HpoTermAnalysisResultFormatter(hpo) +>>> summary_df = formatter.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest From 81846c1b2f0d7b3f1bb42e67866d1797b2f641d7 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 13:50:20 +0200 Subject: [PATCH 5/7] Rename `HpoTermAnalysisResultFormatter` to `HpoTermAnalysisResultViewer`. --- docs/tutorial.rst | 6 +++--- docs/user-guide/stats.rst | 6 +++--- src/gpsea/view/__init__.py | 4 ++-- src/gpsea/view/_phenotype_analysis.py | 6 +++--- tests/view/test_hpo_phenotype_result_viewer.py | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index d51d38685..1f20fb8f3 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,9 +268,9 @@ by exploring the phenotype MTC filtering report. and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.view import HpoTermAnalysisResultFormatter ->>> formatter = HpoTermAnalysisResultFormatter(hpo) ->>> summary_df = formatter.make_summary_dataframe(result) +>>> from gpsea.view import HpoTermAnalysisResultViewer +>>> result_viewer = HpoTermAnalysisResultViewer(hpo) +>>> summary_df = result_viewer.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index c25870507..47b02162b 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,9 +254,9 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.view import HpoTermAnalysisResultFormatter ->>> formatter = HpoTermAnalysisResultFormatter(hpo) ->>> summary_df = formatter.make_summary_dataframe(result) +>>> from gpsea.view import HpoTermAnalysisResultViewer +>>> result_viewer = HpoTermAnalysisResultViewer(hpo) +>>> summary_df = result_viewer.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index e236f802c..b49ccfe6b 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,6 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable -from ._phenotype_analysis import HpoTermAnalysisResultFormatter +from ._phenotype_analysis import HpoTermAnalysisResultViewer from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -13,7 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', - 'HpoTermAnalysisResultFormatter', + 'HpoTermAnalysisResultViewer', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 26cec71bd..197643b64 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -4,9 +4,9 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult -class HpoTermAnalysisResultFormatter: +class HpoTermAnalysisResultViewer: """ - `HpoTermAnalysisResultFormatter` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` + `HpoTermAnalysisResultViewer` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` in a form suitable for humans. This includes preparing a dataframe with counts, frequencies, and p values for the tested HPO terms. @@ -61,7 +61,7 @@ def make_summary_dataframe( # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). labeled_idx = df.index.map( - lambda term_id: HpoTermAnalysisResultFormatter._format_term_id( + lambda term_id: HpoTermAnalysisResultViewer._format_term_id( self._hpo, term_id ) ) diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py index 71122eb9c..09d36b807 100644 --- a/tests/view/test_hpo_phenotype_result_viewer.py +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -2,7 +2,7 @@ import pytest from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.view import HpoTermAnalysisResultFormatter +from gpsea.view import HpoTermAnalysisResultViewer class TestHpoTermAnalysisResultFormatter: @@ -11,15 +11,15 @@ class TestHpoTermAnalysisResultFormatter: def formatter( self, hpo: hpotk.MinimalOntology, - ) -> HpoTermAnalysisResultFormatter: - return HpoTermAnalysisResultFormatter( + ) -> HpoTermAnalysisResultViewer: + return HpoTermAnalysisResultViewer( hpo=hpo, ) @pytest.mark.skip("Only for manual control") def test_summarize( self, - formatter: HpoTermAnalysisResultFormatter, + formatter: HpoTermAnalysisResultViewer, hpo_result: HpoTermAnalysisResult, ): df = formatter.make_summary_dataframe(result=hpo_result) From e4cc40a157bb59065234b797991a3b5017ae0097 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 13:59:50 +0200 Subject: [PATCH 6/7] .. but let's make it a function. --- docs/tutorial.rst | 5 +- docs/user-guide/stats.rst | 5 +- src/gpsea/view/__init__.py | 4 +- src/gpsea/view/_phenotype_analysis.py | 127 ++++++++---------- .../view/test_hpo_phenotype_result_viewer.py | 29 ++-- 5 files changed, 69 insertions(+), 101 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1f20fb8f3..dcecee150 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,9 +268,8 @@ by exploring the phenotype MTC filtering report. and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.view import HpoTermAnalysisResultViewer ->>> result_viewer = HpoTermAnalysisResultViewer(hpo) ->>> summary_df = result_viewer.make_summary_dataframe(result) +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 47b02162b..13e5be53e 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,9 +254,8 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.view import HpoTermAnalysisResultViewer ->>> result_viewer = HpoTermAnalysisResultViewer(hpo) ->>> summary_df = result_viewer.make_summary_dataframe(result) +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index b49ccfe6b..7bb12e8e3 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,6 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable -from ._phenotype_analysis import HpoTermAnalysisResultViewer +from ._phenotype_analysis import summarize_hpo_analysis from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -13,7 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', - 'HpoTermAnalysisResultViewer', + 'summarize_hpo_analysis', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 197643b64..75b65866d 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -4,88 +4,69 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult -class HpoTermAnalysisResultViewer: +def summarize_hpo_analysis( + hpo: hpotk.MinimalOntology, + result: HpoTermAnalysisResult, +) -> pd.DataFrame: """ - `HpoTermAnalysisResultViewer` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` - in a form suitable for humans. + Create a dataframe with counts, frequencies, and p values for the tested HPO terms. - This includes preparing a dataframe with counts, frequencies, and p values for the tested HPO terms. + :param hpo: HPO data. + :param result: the HPO term analysis results to show. """ + assert isinstance(result, HpoTermAnalysisResult) - def __init__( - self, - hpo: hpotk.MinimalOntology, - ): - self._hpo = hpo + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(result.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), + names=(result.gt_predicate.get_question_base(), None), + ) - def make_summary_dataframe( - self, - result: HpoTermAnalysisResult, - ) -> pd.DataFrame: - """ - Format the `result` into a dataframe for human consumption. - """ - assert isinstance(result, HpoTermAnalysisResult) + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(result.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), - names=(result.gt_predicate.get_question_base(), None), - ) + for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + # Sum across the phenotype categories (collapse the rows). + gt_totals = count.sum() - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + for gt_cat in count.columns: + cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" - for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): - # Sum across the phenotype categories (collapse the rows). - gt_totals = count.sum() - - for gt_cat in count.columns: - cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] - total = gt_totals[gt_cat] - df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if result.corrected_pvals is not None: + df.insert(df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals) + df.insert(df.shape[1], ("", p_val_col_name), result.pvals) - # Add columns with p values and corrected p values (if present) - p_val_col_name = "p values" - corrected_p_val_col_name = "Corrected p values" - if result.corrected_pvals is not None: - df.insert( - df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals - ) - df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map(lambda term_id: format_term_id(hpo, term_id)) - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map( - lambda term_id: HpoTermAnalysisResultViewer._format_term_id( - self._hpo, term_id - ) - ) + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + if result.corrected_pvals is not None: + return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]) + else: + return df.sort_values(by=("", p_val_col_name)) - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if result.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ) - else: - return df.sort_values(by=("", p_val_col_name)) - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). - """ - if term_id.prefix == "HP": - term_name = hpo.get_term_name(term_id) - return f"{term_name} [{term_id.value}]" - else: - return term_id.value +def format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, +) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). + """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py index 09d36b807..12e0df74f 100644 --- a/tests/view/test_hpo_phenotype_result_viewer.py +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -2,26 +2,15 @@ import pytest from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.view import HpoTermAnalysisResultViewer +from gpsea.view import summarize_hpo_analysis -class TestHpoTermAnalysisResultFormatter: +@pytest.mark.skip("Only for manual control") +def test_summarize( + self, + hpo: hpotk.MinimalOntology, + hpo_result: HpoTermAnalysisResult, +): + df = summarize_hpo_analysis(hpo=hpo, result=hpo_result) - @pytest.fixture - def formatter( - self, - hpo: hpotk.MinimalOntology, - ) -> HpoTermAnalysisResultViewer: - return HpoTermAnalysisResultViewer( - hpo=hpo, - ) - - @pytest.mark.skip("Only for manual control") - def test_summarize( - self, - formatter: HpoTermAnalysisResultViewer, - hpo_result: HpoTermAnalysisResult, - ): - df = formatter.make_summary_dataframe(result=hpo_result) - - print(df) + print(df) From 9e75d23b1838083713850ada0488805652a1aea6 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 14:42:33 +0200 Subject: [PATCH 7/7] Fix import error. --- tests/view/test_stats.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index d38556f67..d57b3ae29 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -7,7 +7,7 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer @@ -22,11 +22,11 @@ def hpo_term_analysis_result( ) -> HpoTermAnalysisResult: return HpoTermAnalysisResult( pheno_predicates=( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001250'), # Seizure ),