From 7241b08bbfffa4502c1ddaa4fcb96bd034e1e64c Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 4 Sep 2024 13:26:44 +0200 Subject: [PATCH 1/2] Remove obsolete analysis code. --- src/gpsea/analysis/__init__.py | 15 - src/gpsea/analysis/_api.py | 472 ------------------ src/gpsea/analysis/_config.py | 352 ------------- src/gpsea/analysis/_gp_analysis.py | 202 -------- src/gpsea/analysis/_gp_impl.py | 195 -------- src/gpsea/analysis/_stats.py | 154 ------ src/gpsea/analysis/_test_fisherExact.py | 28 -- src/gpsea/analysis/_util.py | 61 --- .../analysis/predicate/genotype/__init__.py | 4 +- src/gpsea/analysis/predicate/genotype/_api.py | 11 +- .../predicate/genotype/_gt_predicates.py | 66 +-- .../test_counting_scorer.py} | 0 tests/analysis/test_config.py | 52 -- tests/analysis/test_examples.py | 137 ----- 14 files changed, 4 insertions(+), 1745 deletions(-) delete mode 100644 src/gpsea/analysis/_api.py delete mode 100644 src/gpsea/analysis/_config.py delete mode 100644 src/gpsea/analysis/_gp_analysis.py delete mode 100644 src/gpsea/analysis/_gp_impl.py delete mode 100644 src/gpsea/analysis/_stats.py delete mode 100644 src/gpsea/analysis/_test_fisherExact.py delete mode 100644 src/gpsea/analysis/_util.py rename tests/analysis/{test_pscore.py => pscore/test_counting_scorer.py} (100%) delete mode 100644 tests/analysis/test_config.py delete mode 100644 tests/analysis/test_examples.py diff --git a/src/gpsea/analysis/__init__.py b/src/gpsea/analysis/__init__.py index 1adf3c2ee..e69de29bb 100644 --- a/src/gpsea/analysis/__init__.py +++ b/src/gpsea/analysis/__init__.py @@ -1,15 +0,0 @@ -from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult, HpoMtcReport -# TODO This should go away -from ._config import CohortAnalysisConfiguration, configure_cohort_analysis, configure_default_protein_metadata_service, MtcStrategy -from ._gp_analysis import apply_predicates_on_patients -from ._util import prepare_hpo_terms_of_interest - -__all__ = [ - 'configure_cohort_analysis', - 'CohortAnalysis', 'GenotypePhenotypeAnalysisResult', - 'CohortAnalysisConfiguration', 'MtcStrategy', - 'HpoMtcReport', - 'apply_predicates_on_patients', - 'configure_default_protein_metadata_service', - 'prepare_hpo_terms_of_interest', -] diff --git a/src/gpsea/analysis/_api.py b/src/gpsea/analysis/_api.py deleted file mode 100644 index b2b7d41d7..000000000 --- a/src/gpsea/analysis/_api.py +++ /dev/null @@ -1,472 +0,0 @@ -import abc -import typing -from collections import namedtuple, defaultdict - -import hpotk -import pandas as pd - -from gpsea.model import Patient -from gpsea.preprocessing import ProteinMetadataService -from .predicate import PolyPredicate, PatientCategory -from .predicate.genotype import GenotypePolyPredicate, VariantPredicate, ProteinPredicates -from .predicate.phenotype import P, PhenotypePolyPredicate -from .pscore import PhenotypeScorer, CountingPhenotypeScorer - -PatientsByHPO = namedtuple('PatientsByHPO', field_names=['all_with_hpo', 'all_without_hpo']) - - -class HpoMtcReport: - """ - Class to simplify reporting results of multiple testing filtering by HpoMtcFilter subclasses. - """ - # TODO: delete with no replacement. - - def __init__( - self, - filter_name: str, - mtc_name: str, - filter_results_map: typing.Mapping[str, int], - n_terms_before_filtering: int, - ): - """ - Args: - filter_name: name of the MTC filter strategy (e.g. `heuristic sampler`) - mtc_name: name of the MTC function (e.g. 
`bonferroni`) - filter_results_map: mapping with reasons for filtering out a term as keys, and counts of filtered terms as values - n_terms_before_filtering: the number of HPO terms before filtering - """ - self._filter_name = filter_name - self._mtc_name = mtc_name - self._results_map = filter_results_map - self._n_terms_before_filtering = n_terms_before_filtering - - @property - def filter_method(self) -> str: - """ - Returns: - the name of the HpoMtcFilter method used. - """ - return self._filter_name - - @property - def skipped_terms_dict(self) -> typing.Mapping[str, int]: - """ - Returns: - a mapping with reasons why an HPO term was skipped as keys and counts of the skipped terms as values. - """ - return self._results_map - - @property - def mtc_method(self) -> str: - """ - Returns: - the name of the multiple testing correction method used (e.g. `bonferroni`). - """ - return self._mtc_name - - @property - def n_terms_before_filtering(self) -> int: - """ - Get the number of terms before filtering. - """ - return self._n_terms_before_filtering - - -class GenotypePhenotypeAnalysisResult: - """ - `GenotypePhenotypeAnalysisResult` summarizes results of genotype-phenotype correlation analysis of a cohort. - """ - # TODO: delete and use `gpsea.analysis.pcats.MultiPhenotypeAnalysisResult`. - - def __init__( - self, - n_usable: typing.Mapping[P, int], - all_counts: typing.Mapping[P, pd.DataFrame], - pvals: pd.Series, - corrected_pvals: typing.Optional[pd.Series], - phenotype_categories: typing.Iterable[PatientCategory], - geno_predicate: PolyPredicate, - mtc_filter_report: typing.Optional[HpoMtcReport] = None - ): - self._n_usable = n_usable - self._all_counts = all_counts - self._pvals = pvals - self._corrected_pvals = corrected_pvals - self._phenotype_categories = tuple(phenotype_categories) - self._geno_predicate = geno_predicate - self._mtc_filter_report = mtc_filter_report - - @property - def n_usable(self) -> typing.Mapping[P, int]: - """ - Get a mapping from a phenotype `P` (either an HPO term or a disease ID) - to an `int` with the number of patients where the phenotype was assessable, - and are, thus, usable for genotype-phenotype correlation analysis. - """ - return self._n_usable - - @property - def all_counts(self) -> typing.Mapping[P, pd.DataFrame]: - """ - Get a mapping from the phenotype item to :class:`pandas.DataFrame` with counts of patients - in genotype and phenotype groups. - - An example for a genotype predicate that bins into two categories (`Yes` and `No`) based on presence - of a missense variant in transcript `NM_123456.7`, and phenotype predicate that checks - presence/absence of `HP:0001166` (a phenotype term):: - - Has MISSENSE_VARIANT in NM_123456.7 - No Yes - Present - Yes 1 13 - No 7 5 - - The rows correspond to the phenotype categories, and the columns represent the genotype categories. - """ - return self._all_counts - - @property - def pvals(self) -> pd.Series: - """ - Get a :class:`pandas.Series` with p values for each tested HPO term. - """ - return self._pvals - - @property - def corrected_pvals(self) -> typing.Optional[pd.Series]: - """ - Get an optional :class:`pandas.Series` with p values for each tested HPO term after multiple testing correction. - """ - return self._corrected_pvals - - @property - def phenotype_categories(self) -> typing.Sequence[PatientCategory]: - """ - Get a sequence of phenotype patient categories that can be investigated. 
- """ - return self._phenotype_categories - - @property - def total_tests(self) -> int: - """ - Get total count of tests that were run for this analysis. - """ - return len(self._all_counts) - - @property - def mtc_filter_report(self) -> typing.Optional[HpoMtcReport]: - return self._mtc_filter_report - - def summarize( - self, hpo: hpotk.MinimalOntology, - category: PatientCategory, - ) -> pd.DataFrame: - """ - Create a data frame with summary of the genotype phenotype analysis. - - The *rows* of the frame correspond to the analyzed HPO terms. - - The columns of the data frame have `Count` and `Percentage` per used genotype predicate. - - **Example** - - If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate` - which can compare phenotype with and without a missense variant, we will have a data frame - that looks like this:: - - MISSENSE_VARIANT on `NM_1234.5` No Yes - Count Percent Count Percent p value Corrected p value - Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.000781 0.020299 - Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000 - Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000 - ... ... ... ... ... ... ... - """ - if category not in self._phenotype_categories: - raise ValueError(f'Unknown phenotype category: {category}. Use one of {self._phenotype_categories}') - - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(self._n_usable.keys()) - # Column index: multiindex of counts and percentages for all genotype predicate groups - geno_idx = pd.MultiIndex.from_product( - iterables=(self._geno_predicate.get_categories(), ('Count', 'Percent')), - names=(self._geno_predicate.get_question_base(), None), - ) - - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=geno_idx) - - for pf, count in self._all_counts.items(): - gt_totals = count.sum() # Sum across the phenotype categories (collapse the rows). - for gt_cat in count.columns: - cnt = count.loc[category, gt_cat] - total = gt_totals[gt_cat] - df.loc[pf, (gt_cat, 'Count')] = f'{cnt}/{total}' - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[pf, (gt_cat, 'Percent')] = f'{pct}%' - - # Add columns with p values and corrected p values (if present) - df.insert(df.shape[1], ('', self._pvals.name), self._pvals) - if self._corrected_pvals is not None: - df.insert(df.shape[1], ('', self._corrected_pvals.name), self._corrected_pvals) - - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map(lambda term_id: GenotypePhenotypeAnalysisResult._format_term_id(hpo, term_id)) - - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if self._corrected_pvals is not None: - return df.sort_values(by=[('', self._corrected_pvals.name), ('', self._pvals.name)]) - else: - return df.sort_values(by=('', self._pvals.name)) - - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). - """ - if term_id.prefix == 'HP': - min_onto = hpo.get_term(term_id) - return f'{min_onto.name} [{term_id.value}]' - else: - return term_id.value - - -class PhenotypeScoreAnalysisResult: - """ - `PhenotypeScoreAnalysisResult` includes results of testing genotypes vs. phenotype scores. 
- - See :ref:`Mann Whitney U Test for phenotype score ` for more background. - """ - # TODO: delete and use `gpsea.analysis.pscore.PhenotypeScoreAnalysisResult` - - def __init__( - self, - genotype_phenotype_scores: pd.DataFrame, - p_value: float, - ): - self._genotype_phenotype_scores = genotype_phenotype_scores - self._p_value = p_value - - @property - def genotype_phenotype_scores( - self, - ) -> pd.DataFrame: - """ - Get the DataFrame with the genotype group and the phenotype score for each patient. - - The DataFrame has the following structure: - - ========== ======== ========= - patient_id genotype phenotype - ========== ======== ========= - patient_1 0 1 - patient_2 0 3 - patient_3 1 2 - ... ... ... - ========== ======== ========= - - The DataFrame index includes the patient IDs, and then there are 2 columns - with the `genotype` group id (:attr:`~gpsea.analysis.predicate.PatientCategory.cat_id`) - and the `phenotype` score. - """ - return self._genotype_phenotype_scores - - @property - def p_value(self) -> float: - return self._p_value - - def __str__(self) -> str: - return 'PhenotypeGroupAnalysisResult(' \ - f'genotype_phenotype_scores={self._genotype_phenotype_scores}, ' \ - f'p_value={self._p_value})' - - def __repr__(self) -> str: - return str(self) - - -class CohortAnalysis(metaclass=abc.ABCMeta): - """ - `CohortAnalysis` is a driver class for running genotype-phenotype correlation analyses. - - The class provides various methods to test genotype-phenotype correlations. All methods wrap results - into :class:`GenotypePhenotypeAnalysisResult`. - """ - # TODO: remove and use the analyses described in `User Guide > Statistical tests`. - - def __init__( - self, - hpo: hpotk.MinimalOntology, - protein_service: ProteinMetadataService, - ): - self._hpo = hpotk.util.validate_instance(hpo, hpotk.MinimalOntology, 'hpo') - self._protein_service = protein_service - self._protein_predicates = ProteinPredicates(self._protein_service) - - @abc.abstractmethod - def compare_hpo_vs_genotype( - self, - predicate: VariantPredicate, - ) -> GenotypePhenotypeAnalysisResult: - """ - Bin patients according to a presence of at least one allele that matches `predicate` - and test for genotype-phenotype correlations. - """ - pass - - @abc.abstractmethod - def compare_hpo_vs_recessive_genotype( - self, - predicate: VariantPredicate, - ) -> GenotypePhenotypeAnalysisResult: - """ - Bin patients according to a presence of zero, one, or two alleles that matche the `predicate` - and test for genotype-phenotype correlations. - """ - pass - - @abc.abstractmethod - def compare_hpo_vs_genotype_groups( - self, - predicates: typing.Iterable[VariantPredicate], - group_names: typing.Iterable[str], - ) -> GenotypePhenotypeAnalysisResult: - """ - Bin patients according to a presence of at least one allele that matches - any of the provided `predicates` and test for genotype-phenotype correlations - between the groups. - - Note, the patients that pass testing by >1 genotype predicate are *OMITTED* from the analysis! 
- """ - pass - - @abc.abstractmethod - def compare_disease_vs_genotype( - self, - predicate: VariantPredicate, - disease_ids: typing.Optional[typing.Sequence[typing.Union[str, hpotk.TermId]]] = None, - ) -> GenotypePhenotypeAnalysisResult: - pass - - def compare_genotype_vs_phenotype_group_count( - self, - gt_predicate: GenotypePolyPredicate, - phenotype_group_terms: typing.Iterable[typing.Union[str, hpotk.TermId]], - ) -> PhenotypeScoreAnalysisResult: - # TODO: separate into pscore module - assert isinstance(gt_predicate, GenotypePolyPredicate) - assert gt_predicate.n_categorizations() == 2 - - counting_scorer = CountingPhenotypeScorer.from_query_curies( - hpo=self._hpo, - query=phenotype_group_terms, - ) - - return self.compare_genotype_vs_phenotype_score( - gt_predicate=gt_predicate, - phenotype_scorer=counting_scorer, - ) - - @abc.abstractmethod - def compare_genotype_vs_phenotype_score( - self, - gt_predicate: GenotypePolyPredicate, - phenotype_scorer: PhenotypeScorer, - ) -> PhenotypeScoreAnalysisResult: - """ - Score the patients with a phenotype scoring method and test for correlation between the genotype group - and the phenotype score. - - Args: - gt_predicate: a genotype predicate for binning the patients along the genotype axis. - phenotype_scorer: a callable that computes a phenotype score for a given `Patient`. - """ - pass - - @abc.abstractmethod - def compare_genotype_vs_cohort_phenotypes( - self, - gt_predicate: GenotypePolyPredicate, - ) -> GenotypePhenotypeAnalysisResult: - pass - - @abc.abstractmethod - def compare_genotype_vs_phenotypes( - self, - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], - ): - """ - All analysis functions go through this function. - - The genotype predicate will partition the individuals into non-overlapping groups - along the genotype axis. - The phenotype predicates represent the phenotypes we want to test. - Less phenotypes may actually be tested thanks to :class:`~gpsea.analysis.PhenotypeMtcFilter`. - - Args: - gt_predicate: a predicate for binning the individuals along the genotype axis - pheno_predicates: phenotype predicates for test the individuals along the phenotype axis - """ - pass - - @staticmethod - def _check_min_perc_patients_w_hpo(min_perc_patients_w_hpo: typing.Union[int, float], - cohort_size: int) -> float: - """ - Check if the input meets the requirements. 
- """ - if isinstance(min_perc_patients_w_hpo, int): - if min_perc_patients_w_hpo > 0: - return min_perc_patients_w_hpo / cohort_size - else: - raise ValueError(f'`min_perc_patients_w_hpo` must be a positive `int` ' - f'but got {min_perc_patients_w_hpo}') - elif isinstance(min_perc_patients_w_hpo, float): - if 0 < min_perc_patients_w_hpo <= 1: - return min_perc_patients_w_hpo - else: - raise ValueError(f'`min_perc_patients_w_hpo` must be a `float` in range (0, 1] ' - f'but got {min_perc_patients_w_hpo}') - else: - raise ValueError(f'`min_perc_patients_w_hpo` must be a positive `int` or a `float` in range (0, 1] ' - f'but got {type(min_perc_patients_w_hpo)}') - - @staticmethod - def _group_patients_by_hpo(phenotypic_features: typing.Iterable[hpotk.TermId], - patients: typing.Iterable[Patient], - hpo: hpotk.GraphAware, - missing_implies_excluded: bool) -> PatientsByHPO: - all_with_hpo = defaultdict(list) - all_without_hpo = defaultdict(list) - for hpo_term in phenotypic_features: - for patient in patients: - found = False - for pf in patient.present_phenotypes(): - if hpo_term == pf.identifier or hpo.graph.is_ancestor_of(hpo_term, pf): - # Patient is annotated with `hpo_term` because `pf` is equal to `hpo_term` - # or it is a descendant of `hpo_term`. - all_with_hpo[hpo_term].append(patient) - - # If one `pf` of the patient is found to be a descendant of `hpo`, we must break to prevent - # adding the patient to `present_hpo` more than once due to another descendant! - found = True - break - if not found: - # The patient is not annotated by the `hpo_term`. - - if missing_implies_excluded: - # The `hpo_term` annotation is missing, hence implicitly excluded. - all_without_hpo[hpo_term].append(patient) - else: - # The `hpo_term` must be explicitly excluded patient to be accounted for. - for ef in patient.excluded_phenotypes(): - if hpo_term == ef.identifier or hpo.graph.is_descendant_of(hpo_term, ef): - all_with_hpo[hpo_term].append(patient) - break - - return PatientsByHPO(all_with_hpo, all_without_hpo) diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py deleted file mode 100644 index 2c6d68c63..000000000 --- a/src/gpsea/analysis/_config.py +++ /dev/null @@ -1,352 +0,0 @@ -import enum -import logging -import os -import typing -import warnings - -import hpotk - -from gpsea.config import get_cache_dir_path -from gpsea.model import Cohort -from gpsea.preprocessing import ProteinMetadataService -from gpsea.preprocessing import configure_default_protein_metadata_service as backup_pms -from .mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter -from ._api import CohortAnalysis -from ._gp_analysis import FisherExactAnalyzer -from ._gp_impl import GpCohortAnalysis - -P_VAL_OPTIONS = ( - 'bonferroni', 'b', - 'sidak', 's', - 'holm-sidak', 'hs', - 'holm', 'h', - 'simes-hochberg', 'sh', - 'hommel', 'ho', - 'fdr_bh', 'fdr_by', - 'fdr_tsbh', 'fdr_tsbky', - 'fdr_gbs', - None, -) - - -class MtcStrategy(enum.Enum): - """ - A strategy for mitigating the multiple testing correction (MTC) burden. - """ - - ALL_PHENOTYPE_TERMS = 0 - """ - All phenotype terms (HPO or disease IDs) will be tested. - """ - - SPECIFY_TERMS = 1 - """ - Only the manually provided HPO terms will be tested. - """ - - HPO_MTC = 2 - """ - Only HPO terms present in at least a certain fraction of patients will be tested. 
- """ - - -class CohortAnalysisConfiguration: - """ - `CohortAnalysisConfiguration` is a value class for storing :class:`~gpsea.analysis.CohortAnalysis` - configuration options. - - The class contains the default values upon creation and the configuration option values can be set as properties. - - If an invalid value option is passed to the property setter, a warning is logged and the previous value is retained. - Therefore, it is *impossible* to mis-configure the analysis. - - Default values - ^^^^^^^^^^^^^^ - - ============================== ======================= ========================================= - Option Type Default value - ============================== ======================= ========================================= - ``pval_correction`` `str` `bonferroni` - ``min_n_patients_with_term`` `int` `2` - ``mtc_alpha`` `float` `0.05` - ``include_sv`` `bool` `False` - ``mtc_strategy`` :class:`MtcStrategy` :class:`MtcStrategy.ALL_PHENOTYPE_TERMS` - ============================== ======================= ========================================= - - """ - - def __init__(self): - self._logger = logging.getLogger(__name__) - self._pval_correction = 'bonferroni' - self._mtc_alpha = .05 - self._min_n_patients_with_term = 2 - self._include_sv = False - self._mtc_strategy = MtcStrategy.ALL_PHENOTYPE_TERMS - self._terms_to_test = None # # only relevant for SPECIFIED_TERMS strategy - self._min_patients_w_hpo = None # # only relevant for HPO_MTC strategy - - @property - def pval_correction(self) -> typing.Optional[str]: - """ - Get method for multiple testing p value correction. Default: `bonferroni`. - """ - return self._pval_correction - - @pval_correction.setter - def pval_correction(self, pval_correction: typing.Optional[str]): - """ - Set the method for p value correction. - - See Statsmodels' - `documentation `_ - for the acceptable values. - - :param pval_correction: a `str` with the name of the desired multiple testing correction method or `None` - if no MTC should be applied. - """ - if pval_correction in P_VAL_OPTIONS: - self._pval_correction = pval_correction - else: - self._logger.warning('Ignoring invalid `pval_correction` value %s. Using %s correction.', pval_correction, - self._pval_correction) - - @property - def min_n_patients_with_term(self) -> int: - """ - Get the minimum number of patients that must be annotated with an HPO term - for including the term in the analysis. - """ - return self._min_n_patients_with_term - - @min_n_patients_with_term.setter - def min_n_patients_with_term(self, value: int): - if isinstance(value, int) and value >= 0: - self._min_n_patients_with_term = value - else: - self._logger.warning( - 'Ignoring invalid `min_n_patients_with_term` value %s. Using %s', - value, self._min_n_patients_with_term, - ) - - @property - def mtc_alpha(self) -> float: - """ - The alpha value for multiple testing correction. - """ - return self._mtc_alpha - - @mtc_alpha.setter - def mtc_alpha(self, mtc_alpha: float): - """ - Set new multiple testing correction alpha value. - - :param mtc_alpha: a `float` in range :math:`(0,1]`. - """ - if isinstance(mtc_alpha, float) and 0. < mtc_alpha <= 1.: - self._mtc_alpha = mtc_alpha - else: - self._logger.warning( - '`mtc_alpha` should be a `float` in range `(0, 1]` but was %s. Keeping the previous value %s', - mtc_alpha, self._mtc_alpha, - ) - - @property - def include_sv(self) -> bool: - """ - `True` if we want to include structural variants in the analysis - (i.e. the variants that use symbolic VCF notation). 
- """ - return self._include_sv - - @include_sv.setter - def include_sv(self, include_sv: bool): - """ - Set `include_sv` option. - - :param include_sv: a `bool` with the value. - """ - if isinstance(include_sv, bool): - self._include_sv = include_sv - else: - self._logger.warning('Ignoring invalid `include_sv` value %s. Using %s', include_sv, self._include_sv) - - @property - def mtc_strategy(self) -> MtcStrategy: - """ - Get the MTC filtering strategy to be used. - """ - return self._mtc_strategy - - def all_terms_strategy(self): - """ - Test all phenotype terms. - - See :ref:`use-all-terms-strategy` for more info. - """ - self._mtc_strategy = MtcStrategy.ALL_PHENOTYPE_TERMS - self._min_patients_w_hpo = None - self._terms_to_test = None - - def hpo_mtc_strategy( - self, - min_patients_w_hpo: float = 0.2, - ): - """ - Only test the HPO terms that pass all rules of the HPO filter strategy. - - See :ref:`hpo-mtc-filter-strategy` section for more info on the rules. - - :param threshold_HPO_observed_frequency: a float in range :math:`(0, 1]` to represent - the minimum fraction of patients for an HPO term to be included. - """ - if not isinstance(min_patients_w_hpo, float): - raise ValueError(f'`min_patients_w_hpo` is not a `float`: {min_patients_w_hpo}') - if not 0 < min_patients_w_hpo <= 1: - raise ValueError(f'`min_patients_w_hpo` must be in range (0, 1] but was {min_patients_w_hpo}') - - self._mtc_strategy = MtcStrategy.HPO_MTC - self._min_patients_w_hpo = min_patients_w_hpo - self._terms_to_test = None - - def specify_terms_strategy( - self, - terms_to_test: typing.Iterable[typing.Union[str, hpotk.TermId]], - ): - """ - Mitigate the MTC burden by only testing the specified HPO terms. - - The HPO terms are validated before running the analysis, - to point out invalid CURIE (e.g. `WHATEVER`) values, - or HPO term IDs that are not in the currently used HPO. - - Calling this method will clear any previously specified terms. - - See :ref:`specify-terms-strategy` for more info. - - :param terms_to_test: an iterable with CURIEs (e.g. `HP:0001250`) - or :class:`hpotk.TermId` instances representing the terms to test. - """ - self._mtc_strategy = MtcStrategy.SPECIFY_TERMS - self._min_patients_w_hpo = None - self._terms_to_test = tuple(terms_to_test) - - @property - def terms_to_test(self) -> typing.Optional[typing.Iterable[typing.Union[str, hpotk.TermId]]]: - """ - Get the ids of the terms to be tested in `specify_terms_strategy` - or `None` if :class:`MtcStrategy.SPECIFY_TERMS` will *not* be used. - """ - return self._terms_to_test - - @property - def min_patients_w_hpo(self) -> typing.Optional[float]: - """ - Get the minimum fraction of patients needed to be annotated with an HPO term - to justify being tested or `None` if :class:`MtcStrategy.HPO_MTC` will *not* be used. - """ - return self._min_patients_w_hpo - - -def configure_cohort_analysis( - cohort: Cohort, - hpo: hpotk.MinimalOntology, - protein_source: str = 'UNIPROT', - cache_dir: typing.Optional[str] = None, - config: typing.Optional[CohortAnalysisConfiguration] = None, -) -> CohortAnalysis: - """ - Configure :class:`~gpsea.analysis.CohortAnalysis` for given `cohort`. - - :param cohort: a :class:`~gpsea.model.Cohort` to analyze - :param hpo: a :class:`~hpotk.MinimalOntology` with HPO to use in the analysis - :param protein_source: the resource to retrieve protein annotations from if we cannot find the annotations locally. - Choose from ``{'UNIPROT'}`` (just one fallback implementation is available at the moment). 
- :param config: an optional :class:`CohortAnalysisConfiguration` to parameterize the analysis. - The default parameters will be used if `None`. - """ - if config is None: - config = CohortAnalysisConfiguration() # Use the default config - - cache_path = get_cache_dir_path(cache_dir) - os.makedirs(cache_path, exist_ok=True) - - cache_dir = str(cache_path) - protein_metadata_service = backup_pms(protein_source, cache_dir) - - mtc_filter: PhenotypeMtcFilter - if config.mtc_strategy == MtcStrategy.HPO_MTC: - assert config.min_patients_w_hpo is not None, '`min_patients_w_hpo` must be set if using `HPO_MTC` strategy' - mtc_filter = HpoMtcFilter.default_filter( - hpo=hpo, - term_frequency_threshold=config.min_patients_w_hpo, - ) - elif config.mtc_strategy == MtcStrategy.SPECIFY_TERMS: - assert config.terms_to_test is not None, '`terms_to_test` must be set if using `SPECIFY_TERMS` strategy' - validated_terms_to_test = _validate_terms_to_test(hpo, config.terms_to_test) - mtc_filter = SpecifiedTermsMtcFilter(hpo=hpo, terms_to_test=validated_terms_to_test) - elif config.mtc_strategy == MtcStrategy.ALL_PHENOTYPE_TERMS: - mtc_filter = UseAllTermsMtcFilter() - else: - raise ValueError(f"Did not recognize MtcStrategy {config.mtc_strategy}") - - # Choosing a simple Fisher's exact test for now. - gp_analyzer = FisherExactAnalyzer( - mtc_filter=mtc_filter, - p_val_correction=config.pval_correction, - mtc_alpha=config.mtc_alpha, - ) - - return GpCohortAnalysis( - cohort=cohort, - hpo=hpo, - protein_service=protein_metadata_service, - gp_analyzer=gp_analyzer, - min_n_of_patients_with_term=config.min_n_patients_with_term, - include_sv=config.include_sv, - ) - - -def _validate_terms_to_test( - hpo: hpotk.MinimalOntology, - terms_to_test: typing.Iterable[typing.Union[hpotk.TermId, str]], -) -> typing.Iterable[hpotk.TermId]: - """ - Check that: - * all terms to test are valid TermIds/CURIES, - * the term IDs are in used HPO, and - * there is at least one term to test - """ - validated_terms_to_test = set() - - for term in terms_to_test: - if isinstance(term, hpotk.TermId): - pass - elif isinstance(term, str): - term = hpotk.TermId.from_curie(term) - else: - raise ValueError(f'{term} is neither a TermId nor a CURIE `str`!') - - if term not in hpo: - raise ValueError(f"HPO ID {term} not in HPO ontology {hpo.version}") - validated_terms_to_test.add(term) - if len(validated_terms_to_test) == 0: - raise ValueError('Cannot run use {MTC_Strategy.SPECIFY_TERMS} with no HPO terms!') - - return validated_terms_to_test - - -def configure_default_protein_metadata_service( - protein_source: str = 'UNIPROT', - cache_dir: typing.Optional[str] = None, -) -> ProteinMetadataService: - """ - Create default protein metadata service that will cache the protein metadata - in current working directory under `.gpsea_cache/protein_cache` - and reach out to UNIPROT REST API if a cache entry is missing. - """ - # TODO: remove at some point. 
- warnings.warn( - 'Use gpsea.preprocessing.configure_default_protein_metadata_service` instead', - DeprecationWarning, stacklevel=2, - ) - return backup_pms(protein_source=protein_source, cache_dir=cache_dir) diff --git a/src/gpsea/analysis/_gp_analysis.py b/src/gpsea/analysis/_gp_analysis.py deleted file mode 100644 index f786049ce..000000000 --- a/src/gpsea/analysis/_gp_analysis.py +++ /dev/null @@ -1,202 +0,0 @@ -import abc -import typing - -import pandas as pd - -from statsmodels.stats import multitest - -from gpsea.model import Patient -from .mtc_filter import PhenotypeMtcFilter -from .predicate import PatientCategory -from .predicate.genotype import GenotypePolyPredicate -from .predicate.phenotype import PhenotypePolyPredicate, P -from ._api import GenotypePhenotypeAnalysisResult, HpoMtcReport -from ._stats import run_fisher_exact, run_recessive_fisher_exact - - -def apply_predicates_on_patients( - patients: typing.Iterable[Patient], - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], - gt_predicate: GenotypePolyPredicate, -) -> typing.Tuple[ - typing.Collection[PatientCategory], - typing.Mapping[P, int], - typing.Mapping[P, pd.DataFrame], -]: - """ - Apply the phenotype predicates `pheno_predicates` and the genotype predicate `gt_predicate` - to bin the `patients` into categories. - - Note, it may not be possible to bin *all* patients with a genotype/phenotype pair, - since a predicate is allowed to return `None` (e.g. if it bins the patient into MISSENSE or NONSENSE groups - but the patient has no MISSENSE or NONSENSE variants). If this happens, the patient will not be "usable" - for the phenotype `P`. - - Args: - patients: an iterable with the patients to bin into categories - pheno_predicates: an iterable with the phenotype predicates to apply - gt_predicate: a genotype predicate to apply - - Returns: - a tuple with 3 items: - - a collection of unique :class:`PatientCategory` items that the patients were binned into - - a mapping from a phenotype :class:`P` (e.g. an HPO term or a disease) - to an `int` with count of patients that could be binned according to the phenotype `P` - - a mapping from phenotype :class:`P` to a data frame with counts of patients - in i-th phenotype category and j-th genotype category where i and j are rows and columns of the data frame - """ - # TODO: delete with no replacement. - phenotypes = set() - categories = set() - for predicate in pheno_predicates: - categories.update(predicate.get_categories()) - phenotypes.add(predicate.phenotype) - - n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes)) - - # Apply genotype and phenotype predicates - counts = {} - for ph_predicate in pheno_predicates: - if ph_predicate.phenotype not in counts: - # Make an empty frame for keeping track of the counts. 
- counts[ph_predicate.phenotype] = pd.DataFrame( - data=0, - index=pd.Index( - data=ph_predicate.get_categories(), - name=ph_predicate.get_question_base(), - ), - columns=pd.Index( - data=gt_predicate.get_categories(), - name=gt_predicate.get_question_base(), - ), - ) - - for patient in patients: - pheno_cat = ph_predicate.test(patient) - geno_cat = gt_predicate.test(patient) - - if pheno_cat is not None and geno_cat is not None: - counts[pheno_cat.phenotype].loc[pheno_cat.category, geno_cat.category] += 1 - n_usable_patients[pheno_cat.phenotype] += 1 - - return categories, n_usable_patients, counts - - -class GPAnalyzer(typing.Generic[P], metaclass=abc.ABCMeta): - """ - `GPAnalyzer` calculates p values for genotype-phenotype correlation of phenotypic features of interest. - """ - # TODO: delete with no replacement. - - @abc.abstractmethod - def analyze( - self, - patients: typing.Iterable[Patient], - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], - gt_predicate: GenotypePolyPredicate, - ) -> GenotypePhenotypeAnalysisResult: - """ - Test for association between `phenotypic_features` of interest groups or `patients` determined - by the `predicate`. - - :return: :class:`GenotypePhenotypeAnalysisResult` object with the results. - """ - pass - - -class FisherExactAnalyzer(typing.Generic[P], GPAnalyzer[P]): - """ - `FisherExactAnalyzer` uses Fisher's exact test to calculate p value for phenotypic features of interest. - - Following the test, the code applies one of the multiple testing corrections provided by - :func:`statsmodels.stats.multitest.multipletests` function at given `mtc_alpha`. - - :param p_val_correction: a `str` with name of the multiple testing correction method or `None` if no correction - should be applied. - :param mtc_alpha: a `float` in range :math:`(0, 1]` with the multiple testing correction alpha value. - """ - # TODO: delete and use `gpsea.analysis.pcats.MultiPhenotypeAnalysis`. - - def __init__( - self, - mtc_filter: PhenotypeMtcFilter, - p_val_correction: typing.Optional[str] = None, - mtc_alpha: float = .05, - ): - self._correction = p_val_correction - self._mtc_alpha = mtc_alpha - self._mtc_filter = mtc_filter - - def analyze( - self, - patients: typing.Iterable[Patient], - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], - gt_predicate: GenotypePolyPredicate, - ) -> GenotypePhenotypeAnalysisResult: - pheno_predicates = tuple(pheno_predicates) - if len(pheno_predicates) == 0: - raise ValueError('No phenotype predicates were provided') - - # 1) Count the patients - categories, n_usable, all_counts = apply_predicates_on_patients( - patients, pheno_predicates, gt_predicate, - ) - n_terms_before_filtering = len(n_usable) - - # 1.5) Filter terms for MTC - n_usable_filtered, all_counts_filtered, reason2count = self._mtc_filter.filter_terms_to_test( - gt_predicate=gt_predicate, - n_usable=n_usable, - all_counts=all_counts, - ) - if len(n_usable_filtered) == 0: - raise ValueError("No phenotypes are left for the analysis after MTC filtering step") - - assert len(n_usable_filtered) == len(all_counts_filtered) - - # 2) Statistical tests - pheno_idx = pd.Index(all_counts_filtered.keys(), name='p_val') - pvals = pd.Series(float('nan'), index=pheno_idx, name='p value') - for phenotype in pheno_idx: - counts = all_counts_filtered[phenotype] - # TODO - this is where we must fail unless we have the contingency table of the right size! 
- if counts.shape == (2, 2): - pvals[phenotype] = run_fisher_exact(counts) - elif counts.shape == (2, 3): - pvals[phenotype] = run_recessive_fisher_exact(counts) - else: - raise ValueError( - "Invalid number of categories. " - f"A {counts.shape} table was created. Only (2, 2) and (2, 3) are valid sizes." - ) - - # 3) Multiple correction - if self._correction is not None: - _, pvals_corrected, _, _ = multitest.multipletests(pvals, alpha=self._mtc_alpha, method=self._correction) - corrected_idx = pd.Index(all_counts_filtered.keys(), name='p_val_corrected') - corrected_pvals_series = pd.Series( - data=pvals_corrected, index=corrected_idx, name='Corrected p value', - ) - else: - corrected_pvals_series = None - - mtc_method = "none" - if self._correction is not None: - mtc_method = self._correction - mtc_filter_report = HpoMtcReport( - filter_name=self._mtc_filter.filter_method_name(), - mtc_name=mtc_method, - filter_results_map=reason2count, - n_terms_before_filtering=n_terms_before_filtering, - ) - - # 4) Wrap up - return GenotypePhenotypeAnalysisResult( - n_usable=n_usable_filtered, - all_counts=all_counts_filtered, - pvals=pvals, - corrected_pvals=corrected_pvals_series, - phenotype_categories=categories, - geno_predicate=gt_predicate, - mtc_filter_report=mtc_filter_report - ) diff --git a/src/gpsea/analysis/_gp_impl.py b/src/gpsea/analysis/_gp_impl.py deleted file mode 100644 index 1d01045c0..000000000 --- a/src/gpsea/analysis/_gp_impl.py +++ /dev/null @@ -1,195 +0,0 @@ -import logging -import typing - -import hpotk -import pandas as pd - -from scipy.stats import mannwhitneyu - -from gpsea.model import Cohort -from gpsea.preprocessing import ProteinMetadataService -from .pscore import PhenotypeScorer -from .predicate.genotype import GenotypePolyPredicate, VariantPredicate -from .predicate.genotype import boolean_predicate as wrap_as_boolean_predicate -from .predicate.genotype import groups_predicate as wrap_as_groups_predicate -from .predicate.genotype import recessive_predicate as wrap_as_recessive_predicate -from .predicate.phenotype import PhenotypePolyPredicate, P, PropagatingPhenotypePredicate, DiseasePresencePredicate - -from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult, PhenotypeScoreAnalysisResult -from ._gp_analysis import GPAnalyzer -from ._util import prepare_hpo_terms_of_interest - - -class GpCohortAnalysis(CohortAnalysis): - - def __init__( - self, cohort: Cohort, - hpo: hpotk.MinimalOntology, - protein_service: ProteinMetadataService, - gp_analyzer: GPAnalyzer, - min_n_of_patients_with_term: int, - include_sv: bool = False, - ): - super().__init__( - hpo, - protein_service, - ) - if not isinstance(cohort, Cohort): - raise ValueError(f"cohort must be type Cohort but was type {type(cohort)}") - - self._logger = logging.getLogger(__name__) - self._cohort = cohort - # self._phenotype_filter = hpotk.util.validate_instance(phenotype_filter, PhenotypeFilter, 'phenotype_filter') - self._gp_analyzer = hpotk.util.validate_instance(gp_analyzer, GPAnalyzer, 'gp_analyzer') - - self._patient_list = list(cohort.all_patients) \ - if include_sv \ - else [pat for pat in cohort.all_patients if not all(var.variant_info.is_structural() for var in pat.variants)] - if len(self._patient_list) == 0: - raise ValueError('No patients left for analysis!') - - self._hpo_terms_of_interest = prepare_hpo_terms_of_interest( - hpo=self._hpo, - patients=self._patient_list, - min_n_of_patients_with_term=min_n_of_patients_with_term, - ) - - def compare_hpo_vs_genotype( - self, - predicate: 
VariantPredicate, - ) -> GenotypePhenotypeAnalysisResult: - """ - Bin patients according to a presence of at least one allele that matches `predicate` - and test for genotype-phenotype correlations. - """ - assert isinstance(predicate, VariantPredicate), \ - f'{type(predicate)} is not an instance of `VariantPredicate`' - bool_predicate = wrap_as_boolean_predicate(predicate) - return self.compare_genotype_vs_cohort_phenotypes(bool_predicate) - - def compare_hpo_vs_recessive_genotype( - self, - predicate: VariantPredicate, - ) -> GenotypePhenotypeAnalysisResult: - """ - Bin patients according to a presence of zero, one, or two alleles that matche the `predicate` - and test for genotype-phenotype correlations. - """ - rec_predicate = wrap_as_recessive_predicate(predicate) - return self.compare_genotype_vs_cohort_phenotypes(rec_predicate) - - def compare_hpo_vs_genotype_groups( - self, - predicates: typing.Iterable[VariantPredicate], - group_names: typing.Iterable[str], - ) -> GenotypePhenotypeAnalysisResult: - """ - Bin patients according to a presence of at least one allele that matches - any of the provided `predicates` and test for genotype-phenotype correlations - between the groups. - - Note, the patients that pass testing by >1 genotype predicate are *OMITTED* from the analysis! - """ - predicate = wrap_as_groups_predicate( - predicates=predicates, - group_names=group_names, - ) - return self.compare_genotype_vs_cohort_phenotypes(predicate) - - def compare_disease_vs_genotype( - self, - predicate: VariantPredicate, - disease_ids: typing.Optional[typing.Sequence[typing.Union[str, hpotk.TermId]]] = None, - ) -> GenotypePhenotypeAnalysisResult: - pheno_predicates = self._prepare_disease_predicates(disease_ids) - - # This can be updated to any genotype poly predicate in future, if necessary. - genotype_predicate = wrap_as_boolean_predicate(predicate) - return self.compare_genotype_vs_phenotypes(genotype_predicate, pheno_predicates) - - def _prepare_disease_predicates( - self, - disease_ids: typing.Optional[typing.Sequence[typing.Union[str, hpotk.TermId]]], - ) -> typing.Sequence[DiseasePresencePredicate]: - testing_diseases = [] - if disease_ids is None: - disease_ids = [dis.identifier for dis in self._cohort.all_diseases()] - if len(disease_ids) < 1: - raise ValueError("No diseases available for testing.") - for disease_id in disease_ids: - if isinstance(disease_id, str): - testing_diseases.append(hpotk.TermId.from_curie(disease_id)) - elif isinstance(disease_id, hpotk.TermId): - testing_diseases.append(disease_id) - else: - raise ValueError(f'{disease_id} must be a `str` or a `hpotk.TermId`') - pheno_predicates = [] - for disease in testing_diseases: - pheno_predicates.append(DiseasePresencePredicate(disease)) - return pheno_predicates - - def compare_genotype_vs_phenotype_score( - self, - gt_predicate: GenotypePolyPredicate, - phenotype_scorer: PhenotypeScorer, - ) -> PhenotypeScoreAnalysisResult: - idx = pd.Index(tuple(p.patient_id for p in self._patient_list), name='patient_id') - data = pd.DataFrame( - None, - index=idx, - columns=['genotype', 'phenotype'], - ) - - # Apply the predicates on the patients - for patient in self._patient_list: - gt_cat = gt_predicate.test(patient) - data.loc[patient.patient_id, 'genotype'] = None if gt_cat is None else gt_cat.category.cat_id - data.loc[patient.patient_id, 'phenotype'] = phenotype_scorer.score(patient) - - # To improve the determinism - data.sort_index(inplace=True) - - # Sort by PatientCategory.cat_id and unpack. 
- # For now, we only allow to have up to 2 groups. - x_key, y_key = sorted(data['genotype'].dropna().unique()) - x = data.loc[data['genotype'] == x_key, 'phenotype'].to_numpy(dtype=float) - y = data.loc[data['genotype'] == y_key, 'phenotype'].to_numpy(dtype=float) - result = mannwhitneyu( - x=x, - y=y, - alternative='two-sided', - ) - - return PhenotypeScoreAnalysisResult( - genotype_phenotype_scores=data, - p_value=float(result.pvalue), - ) - - def compare_genotype_vs_cohort_phenotypes( - self, - gt_predicate: GenotypePolyPredicate, - ) -> GenotypePhenotypeAnalysisResult: - assert isinstance(gt_predicate, GenotypePolyPredicate), \ - f'{type(gt_predicate)} is not an instance of `GenotypePolyPredicate`' - - pheno_predicates = self._prepare_phenotype_predicates() - return self.compare_genotype_vs_phenotypes(gt_predicate, pheno_predicates) - - def _prepare_phenotype_predicates(self) -> typing.Sequence[PhenotypePolyPredicate[P]]: - return tuple( - PropagatingPhenotypePredicate( - hpo=self._hpo, - query=query, - ) - for query in self._hpo_terms_of_interest) - - def compare_genotype_vs_phenotypes( - self, - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], - ) -> GenotypePhenotypeAnalysisResult: - return self._gp_analyzer.analyze( - patients=self._patient_list, - pheno_predicates=pheno_predicates, - gt_predicate=gt_predicate, - ) diff --git a/src/gpsea/analysis/_stats.py b/src/gpsea/analysis/_stats.py deleted file mode 100644 index 1364e7d56..000000000 --- a/src/gpsea/analysis/_stats.py +++ /dev/null @@ -1,154 +0,0 @@ -import abc -import math -import typing -from decimal import Decimal - -import numpy as np -import scipy - -# TODO: delete the module - - -class MultiFisherExact(metaclass=abc.ABCMeta): - - @abc.abstractmethod - def calculate(self, a: np.ndarray) -> float: - """ - :param a: a 2x3 int array with counts - :returns: a p value calculated with Fisher's exact test - """ - pass - - @staticmethod - def _check_input(a: np.ndarray): - if not isinstance(a, np.ndarray): - raise ValueError(f'Expected a numpy array but got {type(a)}') - if not a.shape == (2, 3): - raise ValueError(f'Shape of the array must be (2, 3) but got {a.shape}') - if np.array_equal(a, np.zeros_like(a)): - raise ValueError(f'Array is all zeros, cannot run analysis') - - -class PythonMultiFisherExact(MultiFisherExact): - - def calculate(self, a: np.ndarray) -> float: - MultiFisherExact._check_input(a) - return self._fisher_exact(a) - - def _fisher_exact(self, table): - row_sum = [] - col_sum = [] - - for i in range(len(table)): - temp = 0 - for j in range(len(table[0])): - temp += table[i][j] - row_sum.append(temp) - - for j in range(len(table[0])): - temp = 0 - for i in range(len(table)): - temp += table[i][j] - col_sum.append(temp) - - mat = [[0] * len(col_sum)] * len(row_sum) - pos = (0, 0) - - p_0 = 1 - - for x in row_sum: - p_0 *= math.factorial(x) - for y in col_sum: - p_0 *= math.factorial(y) - - n = 0 - for x in row_sum: - n += x - p_0 /= Decimal(math.factorial(n)) - - for i in range(len(table)): - for j in range(len(table[0])): - p_0 /= Decimal(math.factorial(table[i][j])) - - p = [0] - self._dfs(mat, pos, row_sum, col_sum, p_0, p) - - return float(p[0]) - - def _dfs(self, mat, pos, r_sum, c_sum, p_0, p): - - (xx, yy) = pos - (r, c) = (len(r_sum), len(c_sum)) - - mat_new = [] - - for i in range(len(mat)): - temp = [] - for j in range(len(mat[0])): - temp.append(mat[i][j]) - mat_new.append(temp) - - if xx == -1 and yy == -1: - for i in range(r - 1): - temp = 
r_sum[i] - for j in range(c - 1): - temp -= mat_new[i][j] - mat_new[i][c - 1] = temp - for j in range(c - 1): - temp = c_sum[j] - for i in range(r - 1): - temp -= mat_new[i][j] - mat_new[r - 1][j] = temp - temp = r_sum[r - 1] - for j in range(c - 1): - temp -= mat_new[r - 1][j] - if temp < 0: - return - mat_new[r - 1][c - 1] = temp - - p_1 = 1 - for x in r_sum: - p_1 *= math.factorial(x) - for y in c_sum: - p_1 *= math.factorial(y) - - n = 0 - for x in r_sum: - n += x - p_1 /= Decimal(math.factorial(n)) - - for i in range(len(mat_new)): - for j in range(len(mat_new[0])): - p_1 /= Decimal(math.factorial(mat_new[i][j])) - if p_1 <= p_0 + Decimal(0.00000001): - # print(mat_new) - # print(p_1) - p[0] += p_1 - else: - max_1 = r_sum[xx] - max_2 = c_sum[yy] - for j in range(c): - max_1 -= mat_new[xx][j] - for i in range(r): - max_2 -= mat_new[i][yy] - for k in range(min(max_1, max_2) + 1): - mat_new[xx][yy] = k - if xx == r - 2 and yy == c - 2: - pos_new = (-1, -1) - elif xx == r - 2: - pos_new = (0, yy + 1) - else: - pos_new = (xx + 1, yy) - self._dfs(mat_new, pos_new, r_sum, c_sum, p_0, p) - - -def run_recessive_fisher_exact(two_by_three_table: typing.Sequence[typing.Sequence[int]]): - a = np.array(two_by_three_table, dtype=np.int64) - test_class = PythonMultiFisherExact() - val = test_class.calculate(a) - return val - - -def run_fisher_exact(two_by_two_table: typing.Sequence[typing.Sequence[int]]): - oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') - return p diff --git a/src/gpsea/analysis/_test_fisherExact.py b/src/gpsea/analysis/_test_fisherExact.py deleted file mode 100644 index fd530f5c6..000000000 --- a/src/gpsea/analysis/_test_fisherExact.py +++ /dev/null @@ -1,28 +0,0 @@ -import pytest -import numpy as np -from contextlib import nullcontext as does_not_raise -from ._stats import PythonMultiFisherExact - - -@pytest.fixture -def MultiExact() -> PythonMultiFisherExact: - return PythonMultiFisherExact() - -# TODO: remove - -@pytest.mark.parametrize('table, raise_error, pVal', - ([[[0,0,0],[0,0,0]], pytest.raises(ValueError), None], - [[[2, 1, 0],[3, 0, 2]], does_not_raise(), 0.6429], - #[[[100, 150], [500, 460], [420, 400]], pytest.raises(OverflowError), None], - [[[5,5,5],[5,5,5]], does_not_raise(), 1], - [[[10, 5, 20], [15, 5]], pytest.raises(ValueError), None], - [[[10, 2, 3],[1, 3, 4]], does_not_raise(), 0.0395], - [[[6, 0, 0],[6, 5, 8]], does_not_raise(), 0.0233], - [[[],[]], pytest.raises(ValueError), None] -)) -def test_multiFisherExact(table, raise_error, pVal, MultiExact): - with raise_error: - np_table = np.array(table, dtype=np.int64) - final_pval = MultiExact.calculate(np_table) - assert round(final_pval, 4) == pVal - \ No newline at end of file diff --git a/src/gpsea/analysis/_util.py b/src/gpsea/analysis/_util.py deleted file mode 100644 index 2c3445b56..000000000 --- a/src/gpsea/analysis/_util.py +++ /dev/null @@ -1,61 +0,0 @@ -import typing - -from collections import Counter - -import hpotk - - -from gpsea.model import Patient - - -def prepare_hpo_terms_of_interest( - hpo: hpotk.graph.GraphAware, - patients: typing.Iterable[Patient], - min_n_of_patients_with_term: int, -) -> typing.Collection[hpotk.TermId]: - """ - Prepare a collection of HPO terms to test. - - This includes the direct HPO patient annotations - as well as the ancestors of the present terms and the descendants of the excluded terms. - - :param hpo: an entity with an HPO graph (e.g. :class:`hpotk.MinimalOntology`). - :param patients: an iterable with patients. 
- :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term - (either directly or indirectly) for the term to be included in the analysis. - """ - # TODO remove in favor of `gpsea.analysis.predicate.phenotype` - present_count = Counter() - excluded_count = Counter() - - for patient in patients: - for pf in patient.phenotypes: - if pf.is_present: - # A present phenotypic feature must be counted in. - present_count[pf.identifier] += 1 - # and it also implies presence of its ancestors. - for anc in hpo.graph.get_ancestors(pf): - present_count[anc] += 1 - else: - # An excluded phenotypic feature - excluded_count[pf.identifier] += 1 - for desc in hpo.graph.get_descendants(pf): - # implies exclusion of its descendants. - excluded_count[desc] += 1 - - total_count = Counter() - for term_id, count in present_count.items(): - total_count[term_id] += count - - for term_id, count in excluded_count.items(): - total_count[term_id] += count - - final_hpo = [] - for term_id in present_count: - # Keep the term if it is mentioned at least *n* times (incl. being excluded) - # in the cohort - n_all = total_count[term_id] - if n_all >= min_n_of_patients_with_term: - final_hpo.append(term_id) - - return tuple(final_hpo) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 86dd261a2..82d2819d5 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,13 +1,13 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate', + 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py index 39cf020be..07d6fa078 100644 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ b/src/gpsea/analysis/predicate/genotype/_api.py @@ -2,7 +2,7 @@ import typing from gpsea.model import Variant -from .._api import PolyPredicate, Categorization, PatientCategory +from .._api import PolyPredicate, Categorization class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta): @@ -13,15 +13,6 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta pass -class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): - BOTH = Categorization(PatientCategory(0, 'Both', 'The patient belongs in both groups.')) - ONE = Categorization(PatientCategory(1, 'One', 'The patient belongs in one of the two groups.')) - NEITHER = Categorization(PatientCategory(2, 'Neither', 'The patient does not belong in either group.')) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return RecessiveGroupingPredicate.BOTH, RecessiveGroupingPredicate.ONE, RecessiveGroupingPredicate.NEITHER - - class VariantPredicate(metaclass=abc.ABCMeta): """ `VariantPredicate` tests if a variant meets a certain criterion. 
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 43a39594a..8693b67bd 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -7,7 +7,7 @@ from gpsea.model import Patient, Sex from .._api import Categorization, PatientCategory, PatientCategories -from ._api import GenotypePolyPredicate, RecessiveGroupingPredicate +from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter @@ -277,70 +277,6 @@ def filtering_predicate( ) -class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate): - # NOT PART OF THE PUBLIC API - # TODO: this predicate is a bit weird and I think it should eventually go away. - # Therefore, I do not write any tests at this point. - - def __init__( - self, - allele_counter: AlleleCounter, - ): - self._allele_counter = allele_counter - - def get_question_base(self) -> str: - return self._allele_counter.get_question() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - self._check_patient(patient) - - allele_count = self._allele_counter.count(patient) - if allele_count == 0: - return RecessiveGroupingPredicate.NEITHER - elif allele_count == 1: - return RecessiveGroupingPredicate.ONE - elif allele_count == 2: - return RecessiveGroupingPredicate.BOTH - else: - return None - - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, AlleleCountingRecessivePredicate) - and self._allele_counter == value._allele_counter - ) - - def __hash__(self) -> int: - return hash((self._allele_counter,)) - - def __str__(self) -> str: - return ( - f"AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})" - ) - - def __repr__(self) -> str: - return str(self) - - -def recessive_predicate( - variant_predicate: VariantPredicate, -) -> GenotypePolyPredicate: - """ - Create a recessive grouping predicate from given `variant_predicate` - to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`, - :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`, - depending on the number of variant alleles matching the variant predicate. 
- - The patient is assigned into a group in the following manner: - * 0 alleles: :class:`RecessiveGroupingPredicate.NEITHER` - * 1 alleles: :class:`RecessiveGroupingPredicate.ONE` - * 2 alleles: :class:`RecessiveGroupingPredicate.BOTH` - * other: `None` - """ - allele_counter = AlleleCounter(predicate=variant_predicate) - return AlleleCountingRecessivePredicate(allele_counter=allele_counter) - - @dataclasses.dataclass(eq=True, frozen=True) class GenotypeGroup: allele_count: int diff --git a/tests/analysis/test_pscore.py b/tests/analysis/pscore/test_counting_scorer.py similarity index 100% rename from tests/analysis/test_pscore.py rename to tests/analysis/pscore/test_counting_scorer.py diff --git a/tests/analysis/test_config.py b/tests/analysis/test_config.py deleted file mode 100644 index afff55e1e..000000000 --- a/tests/analysis/test_config.py +++ /dev/null @@ -1,52 +0,0 @@ -import pytest - -from gpsea.analysis import CohortAnalysisConfiguration, MtcStrategy - - -class TestCohortAnalysisConfiguration: - - def test_default_values(self): - config = CohortAnalysisConfiguration() - - assert config.pval_correction == 'bonferroni' - assert config.min_patients_w_hpo is None - assert config.include_sv is False - assert config.mtc_alpha == pytest.approx(.05) - assert config.mtc_strategy == MtcStrategy.ALL_PHENOTYPE_TERMS - assert config.terms_to_test is None - - def test_set_all_terms_strategy(self): - config = CohortAnalysisConfiguration() - assert config.mtc_strategy == MtcStrategy.ALL_PHENOTYPE_TERMS - - config.specify_terms_strategy(('HP:0001250', 'HP:0001166')) - - assert config.mtc_strategy == MtcStrategy.SPECIFY_TERMS - assert config.terms_to_test == ('HP:0001250', 'HP:0001166') - - def test_set_hpo_mtc_strategy(self): - config = CohortAnalysisConfiguration() - assert config.mtc_strategy == MtcStrategy.ALL_PHENOTYPE_TERMS - - config.hpo_mtc_strategy() - - assert config.mtc_strategy == MtcStrategy.HPO_MTC - assert config.min_patients_w_hpo == pytest.approx(0.2) - - @pytest.mark.parametrize( - 'value', - [ - -.0, - 1.01, - ] - ) - def test_cannot_set_invalid_threshold_in_hpo_mtc_strategy( - self, - value: float, - ): - config = CohortAnalysisConfiguration() - - with pytest.raises(ValueError) as e: - config.hpo_mtc_strategy(value) - - assert e.value.args[0] == f'`min_patients_w_hpo` must be in range (0, 1] but was {value}' diff --git a/tests/analysis/test_examples.py b/tests/analysis/test_examples.py deleted file mode 100644 index 8f17161d6..000000000 --- a/tests/analysis/test_examples.py +++ /dev/null @@ -1,137 +0,0 @@ -import typing - -import hpotk -import pytest - -import pandas as pd - -from gpsea.analysis import configure_cohort_analysis, GenotypePhenotypeAnalysisResult -from gpsea.analysis.predicate import PatientCategories -from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate -from gpsea.model import Cohort, VariantEffect - - -# TODO: remove at some point! 
diff --git a/tests/analysis/test_examples.py b/tests/analysis/test_examples.py
deleted file mode 100644
index 8f17161d6..000000000
--- a/tests/analysis/test_examples.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import typing
-
-import hpotk
-import pytest
-
-import pandas as pd
-
-from gpsea.analysis import configure_cohort_analysis, GenotypePhenotypeAnalysisResult
-from gpsea.analysis.predicate import PatientCategories
-from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate
-from gpsea.model import Cohort, VariantEffect
-
-
-# TODO: remove at some point!
-@pytest.mark.skip('Obsolete tests')
-class TestCohortAnalysis:
-
-    def test_compare_by_variant_effect(
-        self,
-        suox_cohort: Cohort,
-        hpo: hpotk.MinimalOntology,
-    ):
-        pd.set_option('expand_frame_repr', False)
-        cohort_analysis = configure_cohort_analysis(suox_cohort, hpo)
-        is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_001032386.2')
-        results = cohort_analysis.compare_hpo_vs_genotype(is_missense)
-        print(results)
-        summary = results.summarize(hpo, PatientCategories.YES)
-        print(summary)
-
-    def test_get_count(
-        self,
-        suox_cohort: Cohort,
-        hpo: hpotk.MinimalOntology,
-    ):
-        """
-        This test shows how to manipulate the results object to get the counts we need
-        Let's use Arachnodactyly [HP:0001166] Yes (Genotype NO 1/10); No (Genotype YES 13/16) as an example
-        that is
-        Arachnodactyly (+) MISSENSE (-) = 1
-        Arachnodactyly (-) MISSENSE (-) = 9
-        Arachnodactyly (+) MISSENSE (+) = 13
-        Arachnodactyly (-) MISSENSE (+) = 3
-        In the Toy file, we have
-        Arachnodactyly TRUE, MISSENSE (snv) TRUE: A,B,D,E;G;J;M;P;Q;R;T;V;Y = 13
-        Arachnodactyly FALSE, MISSENSE (snv) TRUE: C,K,N=3
-        Arachnodactyly TRUE, MISSENSE (snv) FALSE: H = 1
-        Arachnodactyly FALSE, MISSENSE (snv) FALSE: F,I,L,O,S;U,W,X,Z=9
-        This is the output
-        MISSENSE_VARIANT on NM_1234.5                 No                Yes
-                                            Count  Percent    Count  Percent   p value  Corrected p value
-        Arachnodactyly [HP:0001166]          1/10    10.0%    13/16   81.25%  0.000781           0.020299
-        """
-        pd.set_option('expand_frame_repr', False)
-        cohort_analysis = configure_cohort_analysis(suox_cohort, hpo)
-        is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_001032386.2')
-        results = cohort_analysis.compare_hpo_vs_genotype(is_missense)
-
-        # Let's make sure we know what class we have
-        assert isinstance(results, GenotypePhenotypeAnalysisResult)
-
-        # We expect that we have YES and NO as phenotype categories
-        phenotype_categories_tuple = results.phenotype_categories
-        assert len(phenotype_categories_tuple) == 2
-        assert PatientCategories.YES in phenotype_categories_tuple
-        assert PatientCategories.NO in phenotype_categories_tuple
-
-        # The all_counts field has the counts of the phenotypes
-        all_counts = results.all_counts
-        assert isinstance(all_counts, typing.Mapping)
-
-        # We tested 66 HPO terms
-        assert len(all_counts) == 66
-
-        # The index of all_counts is a Tuple with (HPO TermId, BooleanPredicate
-        # Let's test Seizure - we should have one row for each Patient Predicate
-        counts = all_counts[hpotk.TermId.from_curie("HP:0001250")]
-
-        # The YES row is Seizure YES -- according to the above, we have 11 (MISSENSE NO) and 17 (MISSENSE YES)
-        assert counts.loc[PatientCategories.YES, PatientCategories.NO] == 11
-        assert counts.loc[PatientCategories.YES, PatientCategories.YES] == 17
-
-        # The NO row is Seizure NO -- according to the above, we have 0 (MISSENSE NO) and 7 (MISSENSE YES)
-        assert counts.loc[PatientCategories.NO, PatientCategories.NO] == 0
-        assert counts.loc[PatientCategories.NO, PatientCategories.YES] == 7
-
-        # In total, 35 patients were categorized
-        assert counts.sum().sum() == 35
-
-    def test_get_positive_count(
-        self,
-        suox_cohort: Cohort,
-        hpo: hpotk.MinimalOntology,
-    ):
-        """
-        This test shows how to get the counts for positive HPO terms
-        Let's use Seizure [HP:0001250] Yes (Genotype NO 11/11); No (Genotype YES 17/24) as an example
-        that is
-        Seizure (+) MISSENSE (-) = 11
-        Seizure (-) MISSENSE (-) = 0
-        Seizure (+) MISSENSE (+) = 17
-        Seizure (-) MISSENSE (+) = 7
-        This means we expect 11+17=28
-        See the previous test for further information
-        """
-        pd.set_option('expand_frame_repr', False)
-        cohort_analysis = configure_cohort_analysis(suox_cohort, hpo)
-        is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_001032386.2')
-        results = cohort_analysis.compare_hpo_vs_genotype(is_missense)
-        all_counts = results.all_counts
-        # The index of all_counts is a Tuple with (HPO TermId, BooleanPredicate
-        # Let's test Arachnodactyly - we should have one row for each Patient Predicate
-        counts = all_counts[hpotk.TermId.from_curie("HP:0001250")]
-        # total_observed_HPO = HeuristicSamplerMtcFilter.get_number_of_positive_observations(arachnodactyly_counts)
-        total_observed_HPO = counts.loc[PatientCategories.YES, PatientCategories.NO] + counts.loc[PatientCategories.YES, PatientCategories.YES]
-        assert total_observed_HPO == 28
-
-    def test_compare_symptom_count_vs_genotype(
-        self,
-        suox_cohort: Cohort,
-        hpo: hpotk.MinimalOntology,
-    ):
-        cohort_analysis = configure_cohort_analysis(suox_cohort, hpo)
-
-        phenotype_group_terms = (
-            'HP:0012638',  # Abnormal nervous system physiology
-            'HP:0001939',  # Abnormality of metabolism/homeostasis
-        )
-        variant_predicate = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, 'NM_001032386.2')
-        gt_predicate = boolean_predicate(variant_predicate)
-
-        phenotype_group_results = cohort_analysis.compare_genotype_vs_phenotype_group_count(
-            gt_predicate=gt_predicate,
-            phenotype_group_terms=phenotype_group_terms,
-        )
-
-        assert phenotype_group_results.p_value == pytest.approx(0.9345982107594922)
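The docstring of the removed `test_get_count` spells out a 2x2 contingency table for Arachnodactyly versus a missense genotype (13, 1, 3, 9) and quotes a p value of 0.000781. That number can be reproduced with a plain Fisher exact test, independently of the deleted `CohortAnalysis` machinery; the sketch below assumes only `pandas` and `scipy` are available.

```python
import pandas as pd
from scipy.stats import fisher_exact

# Counts taken from the removed test's docstring:
# rows = Arachnodactyly present/absent, columns = missense variant yes/no.
counts = pd.DataFrame(
    [[13, 1],   # Arachnodactyly (+): 13 with a missense variant, 1 without
     [3, 9]],   # Arachnodactyly (-): 3 with a missense variant, 9 without
    index=['HP:0001166 present', 'HP:0001166 absent'],
    columns=['missense yes', 'missense no'],
)

odds_ratio, p_value = fisher_exact(counts.to_numpy(), alternative='two-sided')
print(round(p_value, 6))  # ~0.000781, matching the value quoted in the docstring
```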
From e9ff7d45689895719b821abcddb1eddd8f8a352c Mon Sep 17 00:00:00 2001
From: Daniel Danis
Date: Wed, 4 Sep 2024 13:46:18 +0200
Subject: [PATCH 2/2] Update `README.md`.

---
 README.md | 43 ++++---------------------------------------
 1 file changed, 4 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 9ceab1808..2a498eb42 100644
--- a/README.md
+++ b/README.md
@@ -5,46 +5,11 @@
 
 GPSEA is a Python library for discovery of genotype-phenotype associations.
 
-An example of simple genotype-phenotype association analysis
+See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html)
+and a comprehensive [User guide](https://monarch-initiative.github.io/gpsea/stable/user-guide/index.html)
+for more information.
 
-```python
-# Load HPO
-import hpotk
-
-store = hpotk.configure_ontology_store()
-hpo = store.load_minimal_hpo()
-
-# Load a cohort of phenopackets
-from gpsea.data import get_toy_cohort
-
-cohort = get_toy_cohort()
-
-# Analyze genotype-phenotype associations
-from gpsea.analysis import configure_cohort_analysis
-from gpsea.analysis.predicate import PatientCategories
-
-from gpsea.model import VariantEffect
-
-cohort_analysis = configure_cohort_analysis(cohort, hpo)
-frameshift = cohort_analysis.compare_by_variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id='NM_1234.5')
-
-frameshift.summarize(hpo, category=PatientCategories.YES)
-```
-
-provides a pandas data frame with genotype-phenotype correlations:
-
-```text
-FRAMESHIFT_VARIANT on NM_1234.5                       No              Yes
-                                                  Count  Percent  Count  Percent   p value  Corrected p value
- Arachnodactyly [HP:0001166]                        1/10     10%   13/16     81%   0.000781           0.020299
- Abnormality of the musculature [HP:0003011]         6/6    100%   11/11    100%   1.000000           1.000000
- Abnormal nervous system physiology [HP:0012638]     9/9    100%   15/15    100%   1.000000           1.000000
- ...                                                 ...     ...     ...     ...        ...                ...
-```
-
-## Documentation
-
-Check out the User guide and the API reference for more info:
+The documentation comes in two flavors:
 
 - [Stable documentation](https://monarch-initiative.github.io/gpsea/stable/) (last release on `main` branch)
 - [Latest documentation](https://monarch-initiative.github.io/gpsea/latest) (bleeding edge, latest commit on `develop` branch)