diff --git a/.gitignore b/.gitignore index 8ad720caa..777fd2adc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Cache with transcript/protein pickle files .gpsea_cache +dev/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 4ab9175ad..772f062f9 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 -Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 +Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 -Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,, -Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,, -Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,, +Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 +Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, +Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, +Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 0a583b39e..dcecee150 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,8 +268,8 @@ by exploring the phenotype MTC filtering report. 
and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 59ad7604e..5132b4b51 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,8 +254,8 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index e8045d2d0..c7eb6a616 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -13,7 +13,6 @@ from gpsea.model import Patient from gpsea.analysis.pcats.stats import CountStatistic -from ..predicate import PatientCategory from ..predicate.genotype import GenotypePolyPredicate from ..predicate.phenotype import P, PhenotypePolyPredicate from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult @@ -101,14 +100,6 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta): with the order determined by the phenotype order. """ - @property - @abc.abstractmethod - def phenotypes(self) -> typing.Sequence[P]: - """ - Get the tested phenotypes. 
- """ - pass - @property @abc.abstractmethod def n_usable(self) -> typing.Sequence[int]: @@ -165,95 +156,6 @@ def total_tests(self) -> int: """ return sum(1 for pval in self.pvals if not math.isnan(pval)) - def summarize( - self, - hpo: hpotk.MinimalOntology, - target_phenotype_category: PatientCategory, - ) -> pd.DataFrame: - """ - Create a data frame with summary of the genotype phenotype analysis. - - The *rows* of the frame correspond to the analyzed HPO terms. - - The columns of the data frame have `Count` and `Percentage` per used genotype predicate. - - **Example** - - If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate` - which can compare phenotype with and without a missense variant, we will have a data frame - that looks like this:: - - MISSENSE_VARIANT on `NM_1234.5` No Yes - Count Percent Count Percent Corrected p value p value - Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781 - Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000 - Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000 - ... ... ... ... ... ... ... - """ - - # TODO: this should be plotted by a formatter. - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(self.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - # TODO: fix the below - iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")), - names=(self._gt_predicate.get_question_base(), None), - ) - - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - - for phenotype, count in zip(self.phenotypes, self.all_counts): - gt_totals = ( - count.sum() - ) # Sum across the phenotype categories (collapse the rows). 
- for gt_cat in count.columns: - cnt = count.loc[target_phenotype_category, gt_cat] - total = gt_totals[gt_cat] - df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%" - - # Add columns with p values and corrected p values (if present) - p_val_col_name = "p values" - corrected_p_val_col_name = "Corrected p values" - if self.corrected_pvals is not None: - df.insert( - df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals - ) - df.insert(df.shape[1], ("", p_val_col_name), self.pvals) - - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map( - lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id) - ) - - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if self.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ) - else: - return df.sort_values(by=("", p_val_col_name)) - - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). 
- """ - if term_id.prefix == "HP": - term_name = hpo.get_term_name(term_id) - return f"{term_name} [{term_id.value}]" - else: - return term_id.value - class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta): @@ -377,7 +279,8 @@ def _compute_nominal_pvals( def _apply_mtc( self, pvals: typing.Sequence[float], - ) -> typing.Optional[typing.Sequence[float]]: + ) -> typing.Sequence[float]: + assert self._mtc_correction is not None _, corrected_pvals, _, _ = multitest.multipletests( pvals=pvals, alpha=self._mtc_alpha, @@ -392,14 +295,14 @@ class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysis def __init__( self, - phenotypes: typing.Sequence[P], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], corrected_pvals: typing.Optional[typing.Sequence[float]], gt_predicate: GenotypePolyPredicate, ): - self._phenotypes = tuple(phenotypes) + self._pheno_predicates = tuple(pheno_predicates) self._n_usable = tuple(n_usable) self._all_counts = tuple(all_counts) self._pvals = tuple(pvals) @@ -409,9 +312,19 @@ def __init__( assert isinstance(gt_predicate, GenotypePolyPredicate) self._gt_predicate = gt_predicate + @property + def gt_predicate(self) -> GenotypePolyPredicate: + return self._gt_predicate + + @property + def pheno_predicates( + self, + ) -> typing.Sequence[PhenotypePolyPredicate[P]]: + return self._pheno_predicates + @property def phenotypes(self) -> typing.Sequence[P]: - return self._phenotypes + return tuple(p.phenotype for p in self._pheno_predicates) @property def n_usable(self) -> typing.Sequence[int]: @@ -431,7 +344,7 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: def __eq__(self, other): return isinstance(other, BaseMultiPhenotypeAnalysisResult) \ - and self._phenotypes == other._phenotypes \ + and self._pheno_predicates == other._pheno_predicates \ and self._n_usable == 
other._n_usable \ and self._all_counts == other._all_counts \ and self._pvals == other._pvals \ @@ -441,7 +354,7 @@ def __eq__(self, other): def __hash__(self): # NOTE: if a field is added, the hash method of the subclasses must be updated as well. return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, )) @@ -474,10 +387,8 @@ def _compute_result( else: corrected_pvals = self._apply_mtc(pvals=pvals) - phenotypes = tuple(p.phenotype for p in pheno_predicates) - return BaseMultiPhenotypeAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -496,7 +407,7 @@ class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]): def __init__( self, - phenotypes: typing.Sequence[hpotk.TermId], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], @@ -507,7 +418,7 @@ def __init__( mtc_name: typing.Optional[str], ): super().__init__( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -549,7 +460,7 @@ def __eq__(self, other): def __hash__(self): return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, self._mtc_filter_name, self._mtc_filter_results, self._mtc_name, )) @@ -590,7 +501,6 @@ def _compute_result( pheno_predicates = tuple(pheno_predicates) if len(pheno_predicates) == 0: raise ValueError("No phenotype predicates were provided") - phenotypes = tuple(p.phenotype for p in pheno_predicates) # 1 - Count the patients n_usable, all_counts = apply_predicates_on_patients( @@ -625,7 +535,7 @@ def _compute_result( corrected_pvals[mtc_mask] = 
self._apply_mtc(pvals=pvals[mtc_mask]) return HpoTermAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 76eebcc55..341d17fbf 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -13,6 +13,8 @@ from ._api import VariantPredicate from ._counter import AlleleCounter +# TODO: implement __hash__, __eq__ on predicates + class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 4cb8a8041..01e692d82 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -197,6 +197,15 @@ def test( return None + def __eq__(self, value: object) -> bool: + return isinstance(value, PropagatingPhenotypePredicate) \ + and self._hpo.version == value._hpo.version \ + and self._query == value._query \ + and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded + + def __hash__(self) -> int: + return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) + def __repr__(self): return f"PropagatingPhenotypeBooleanPredicate(query={self._query})" @@ -264,5 +273,12 @@ def test( return self._diagnosis_excluded + def __eq__(self, value: object) -> bool: + return isinstance(value, DiseasePresencePredicate) \ + and self._query == value._query + + def __hash__(self) -> int: + return hash((self._query,)) + def __repr__(self): return f"DiseasePresencePredicate(query={self._query})" diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 6eb13dd61..7bb12e8e3 100644 --- a/src/gpsea/view/__init__.py +++ 
b/src/gpsea/view/__init__.py @@ -1,5 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable +from ._phenotype_analysis import summarize_hpo_analysis from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -12,6 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', + 'summarize_hpo_analysis', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py new file mode 100644 index 000000000..75b65866d --- /dev/null +++ b/src/gpsea/view/_phenotype_analysis.py @@ -0,0 +1,72 @@ +import hpotk +import pandas as pd + +from gpsea.analysis.pcats import HpoTermAnalysisResult + + +def summarize_hpo_analysis( + hpo: hpotk.MinimalOntology, + result: HpoTermAnalysisResult, +) -> pd.DataFrame: + """ + Create a dataframe with counts, frequencies, and p values for the tested HPO terms. + + :param hpo: HPO data. + :param result: the HPO term analysis results to show. + """ + assert isinstance(result, HpoTermAnalysisResult) + + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(result.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), + names=(result.gt_predicate.get_question_base(), None), + ) + + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + + for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + # Sum across the phenotype categories (collapse the rows). 
+ gt_totals = count.sum() + + for gt_cat in count.columns: + cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if result.corrected_pvals is not None: + df.insert(df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals) + df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map(lambda term_id: format_term_id(hpo, term_id)) + + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + if result.corrected_pvals is not None: + return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]) + else: + return df.sort_values(by=("", p_val_col_name)) + + +def format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, +) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as `<term name> [<term ID>]` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). 
+ """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value diff --git a/tests/conftest.py b/tests/conftest.py index 266837245..64ed2e817 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,11 +3,15 @@ import typing import hpotk +import numpy as np +import pandas as pd import pytest +from gpsea.analysis.mtc_filter import PhenotypeMtcResult +from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate -from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder +from gpsea.io import GpseaJSONDecoder from gpsea.model import * from gpsea.model.genome import GRCh38, GenomicRegion, Region, Strand, GenomeBuild from ._protein_test_service import ProteinTestMetadataService @@ -32,6 +36,7 @@ def pytest_collection_modifyitems(config, items): if "online" in item.keywords: item.add_marker(skip_online) + @pytest.fixture(scope='session') def fpath_project_dir(fpath_test_dir: str) -> str: """ @@ -151,6 +156,57 @@ def suox_pheno_predicates( ) +@pytest.fixture +def hpo_result( + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_predicate: GenotypePolyPredicate, +) -> HpoTermAnalysisResult: + return HpoTermAnalysisResult( + pheno_predicates=suox_pheno_predicates, + n_usable=(40, 20, 30, 10, 100), + all_counts=tuple( + make_count_df(count, suox_gt_predicate, ph_pred) + for count, ph_pred in zip( + [ + (10, 5, 15, 10), + (5, 2, 8, 5), + (10, 5, 5, 10), + (2, 3, 2, 3), + (10, 25, 35, 30), + ], + suox_pheno_predicates, + ) + ), + pvals=(0.01, 0.04, 0.3, 0.2, 0.7), + corrected_pvals=(0.04, 0.1, 0.7, 0.5, 1.0), + gt_predicate=suox_gt_predicate, + mtc_filter_name="MTC filter name", + mtc_filter_results=( + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + 
PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + ), + mtc_name="MTC name", + ) + + +def make_count_df( + counts: typing.Tuple[int, int, int, int], + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], +) -> pd.DataFrame: + rows = tuple(ph_predicate.get_categories()) + cols = tuple(gt_predicate.get_categories()) + data = np.array(counts).reshape((len(rows), len(cols))) + return pd.DataFrame( + data=data, + index=pd.Index(rows), + columns=pd.Index(cols), + ) + + @pytest.fixture(scope='session') def fpath_suox_tx_coordinates(fpath_test_data_dir: str) -> str: suox_mane_tx_id = 'NM_001032386.2' diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py new file mode 100644 index 000000000..12e0df74f --- /dev/null +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -0,0 +1,15 @@ +import hpotk +import pytest + +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.view import summarize_hpo_analysis + + +@pytest.mark.skip("Only for manual control") +def test_summarize( + hpo: hpotk.MinimalOntology, + hpo_result: HpoTermAnalysisResult, +): + df = summarize_hpo_analysis(hpo=hpo, result=hpo_result) + + print(df) diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index 691f4f6a4..d57b3ae29 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -7,6 +7,7 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer @@ -16,12 +17,19 @@ class TestStatsViewable: @pytest.fixture(scope='class') def hpo_term_analysis_result( self, + hpo: hpotk.MinimalOntology, suox_gt_predicate: GenotypePolyPredicate, ) -> HpoTermAnalysisResult: return 
HpoTermAnalysisResult( - phenotypes=( - hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly - hpotk.TermId.from_curie('HP:0001250'), # Seizure + pheno_predicates=( + HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly + ), + HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001250'), # Seizure + ), ), n_usable=(40, 20), all_counts=(