diff --git a/docs/user-guide/analyses/measurements.rst b/docs/user-guide/analyses/measurements.rst index 1cbfc30d9..6d1e078c7 100644 --- a/docs/user-guide/analyses/measurements.rst +++ b/docs/user-guide/analyses/measurements.rst @@ -87,7 +87,7 @@ We use the measurement of `Testosterone [Mass/volume] in Serum or Plasma >> pheno_scorer.description -'The value of Testosterone [Mass/volume] in Serum or Plasma [LOINC:2986-8]' +'Value of Testosterone [Mass/volume] in Serum or Plasma [LOINC:2986-8]' Statistical test diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst index 2d659f1fd..b259feff1 100644 --- a/docs/user-guide/predicates/hpo_predicate.rst +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -39,7 +39,7 @@ and now we can set up a predicate to test for presence of *Abnormal lens morphol >>> pheno_predicate.name 'HPO Predicate' >>> pheno_predicate.description -'Test for presence of Abnormal lens morphology' +'Test for presence of Abnormal lens morphology [HP:0000517]' >>> pheno_predicate.group_labels ('Yes', 'No') diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py index 1fb0d3dbb..c132f7958 100644 --- a/src/gpsea/analysis/_base.py +++ b/src/gpsea/analysis/_base.py @@ -7,6 +7,7 @@ from .predicate.phenotype import PhenotypePolyPredicate, P from .predicate.genotype import GenotypePolyPredicate +from ._partition import Partitioning class Statistic(metaclass=abc.ABCMeta): @@ -266,12 +267,16 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta): def __init__( self, gt_predicate: GenotypePolyPredicate, + phenotype: Partitioning, statistic: Statistic, data: pd.DataFrame, pval: float, ): super().__init__(gt_predicate, statistic) + assert isinstance(phenotype, Partitioning) + self._phenotype = phenotype + assert isinstance(data, pd.DataFrame) and all( col in data.columns for col in MonoPhenotypeAnalysisResult.DATA_COLUMNS ) @@ -283,6 +288,13 @@ def __init__( raise ValueError( f"`pval` must be a finite float in range [0, 1] but it was {pval}" ) + + @property + def phenotype(self) -> Partitioning: + """ + Get the :class:`~gpsea.analysis.Partitioning` that produced the phenotype. + """ + return self._phenotype @property def data(self) -> pd.DataFrame: @@ -325,12 +337,14 @@ def pval(self) -> float: def __eq__(self, value: object) -> bool: return isinstance(value, MonoPhenotypeAnalysisResult) \ and super(AnalysisResult, self).__eq__(value) \ + and self._phenotype == value._phenotype \ and self._pval == value._pval \ and self._data.equals(value._data) def __hash__(self) -> int: return hash(( super(AnalysisResult, self).__hash__(), + self._phenotype, self._pval, self._data, )) diff --git a/src/gpsea/analysis/_partition.py b/src/gpsea/analysis/_partition.py index 3ae55edbf..4aac168b1 100644 --- a/src/gpsea/analysis/_partition.py +++ b/src/gpsea/analysis/_partition.py @@ -35,7 +35,7 @@ def variable_name(self) -> str: """ Get a `str` with the name of the variable investigated by the partitioning. - For instance `Sex`, `Allele groups`, `HPO term`, `Diagnosis` + For instance `Sex`, `Allele groups`, `HP:0001250`, `OMIM:256000` """ pass diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index e4f580acd..7c75706cd 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -387,11 +387,12 @@ def _compute_result( ) # 2 - Apply MTC filter and select p values to MTC + cohort_size = sum(1 for _ in cohort) mtc_filter_results = self._mtc_filter.filter( gt_predicate=gt_predicate, ph_predicates=pheno_predicates, counts=all_counts, - cohort_size=len(cohort), + cohort_size=cohort_size, ) pvals = np.full(shape=(len(n_usable),), fill_value=np.nan) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 4ad56fb98..1bc6ff7c5 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -211,7 +211,7 @@ def description(self) -> str: @property def variable_name(self) -> str: - return "Allele groups" + return "Allele group" def test(self, patient: Patient) -> typing.Optional[Categorization]: self._check_patient(patient) @@ -357,13 +357,13 @@ def allele_count( >>> from gpsea.analysis.predicate.genotype import allele_count >>> zero_vs_one = allele_count(counts=({0,}, {1,})) >>> zero_vs_one.summarize_groups() - 'Allele counts: 0, 1' + 'Allele count: 0, 1' These counts will create three groups for individuals with zero, one or two alleles: >>> zero_vs_one_vs_two = allele_count(counts=({0,}, {1,}, {2,})) >>> zero_vs_one_vs_two.summarize_groups() - 'Allele counts: 0, 1, 2' + 'Allele count: 0, 1, 2' :param counts: a sequence with allele count partitions. :param target: a predicate for choosing the variants for testing @@ -415,7 +415,7 @@ def description(self) -> str: @property def variable_name(self) -> str: - return "Allele counts" + return "Allele count" def test(self, patient: Patient) -> typing.Optional[Categorization]: self._check_patient(patient) @@ -540,14 +540,6 @@ def create( # Last, put the predicate together. return DiagnosisPredicate(categorizations) - @property - def name(self) -> str: - return "Diagnosis Predicate" - - @property - def description(self) -> str: - return "Partition the individual by diagnosis" - def __init__( self, categorizations: typing.Mapping[hpotk.TermId, Categorization], @@ -558,6 +550,15 @@ def __init__( ) self._hash = hash(tuple(categorizations.items())) + @property + def name(self) -> str: + return "Diagnosis Predicate" + + @property + def description(self) -> str: + diagnoses = ", ".join(cat.category.name for cat in self._categorizations) + return f"Partition the individual by presence of {diagnoses}" + @property def variable_name(self) -> str: return "Diagnosis" diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 54067a0e7..b5503a642 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -152,11 +152,11 @@ def name(self) -> str: @property def description(self) -> str: - return f"Test for presence of {self._query_label}" + return f"Test for presence of {self._query_label} [{self._query.value}]" @property def variable_name(self) -> str: - return f"{self._query_label} is present" + return self._query.value @property def phenotype(self) -> hpotk.TermId: @@ -257,11 +257,11 @@ def name(self) -> str: @property def description(self) -> str: - return "Partition based on a diagnosis" + return f"Partition based on a diagnosis of {self._query.value}" @property def variable_name(self) -> str: - return f"{self._query.value} was diagnosed" + return self._query.value @property def phenotype(self) -> hpotk.TermId: diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py index c05d1b8f7..61956ca99 100644 --- a/src/gpsea/analysis/pscore/_api.py +++ b/src/gpsea/analysis/pscore/_api.py @@ -7,7 +7,7 @@ from ..predicate.genotype import GenotypePolyPredicate from .stats import PhenotypeScoreStatistic -from .._base import MonoPhenotypeAnalysisResult +from .._base import MonoPhenotypeAnalysisResult, Statistic from .._partition import ContinuousPartitioning @@ -71,7 +71,7 @@ def description(self) -> str: @property def variable_name(self) -> str: - return "Score" + return "Phenotype score" def __init__( self, @@ -116,6 +116,26 @@ class PhenotypeScoreAnalysisResult(MonoPhenotypeAnalysisResult): if the phenotype score is impossible to compute. """ + def __init__( + self, + gt_predicate: GenotypePolyPredicate, + phenotype: PhenotypeScorer, + statistic: Statistic, + data: pd.DataFrame, + pval: float, + ): + super().__init__(gt_predicate, phenotype, statistic, data, pval) + assert isinstance(phenotype, PhenotypeScorer) + + def phenotype_scorer(self) -> PhenotypeScorer: + """ + Get the scorer that computed the phenotype score. + """ + # We are sure that `self._phenotype` is a `PhenotypeScorer` + # because of the instance check in `__init__` and `PhenotypeScorer` + # being a subclass of `Partitioning`. + return self._phenotype # type: ignore + def plot_boxplots( self, ax, @@ -168,10 +188,11 @@ def __hash__(self) -> int: def __str__(self) -> str: return ( "PhenotypeScoreAnalysisResult(" - "gt_predicate={self._gt_predicate}, " - "statistic={self._statistic}, " - "data={self._data}, " - "pval={self._pval})" + f"gt_predicate={self._gt_predicate}, " + f"phenotype_scorer={self._phenotype}, " + f"statistic={self._statistic}, " + f"data={self._data}, " + f"pval={self._pval})" ) def __repr__(self) -> str: @@ -213,6 +234,7 @@ def compare_genotype_vs_phenotype_score( assert ( gt_predicate.n_categorizations() == 2 ), "We only support 2 genotype categories at this point" + assert isinstance(pheno_scorer, PhenotypeScorer) idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id") data = pd.DataFrame( @@ -244,6 +266,7 @@ def compare_genotype_vs_phenotype_score( return PhenotypeScoreAnalysisResult( gt_predicate=gt_predicate, + phenotype=pheno_scorer, statistic=self._statistic, data=data, pval=pval, diff --git a/src/gpsea/analysis/pscore/_measurement.py b/src/gpsea/analysis/pscore/_measurement.py index 3930dfc28..104853de3 100644 --- a/src/gpsea/analysis/pscore/_measurement.py +++ b/src/gpsea/analysis/pscore/_measurement.py @@ -68,11 +68,19 @@ def name(self) -> str: @property def description(self) -> str: - return self.variable_name + return f"Value of {self._label} [{self._identifier.value}]" @property def variable_name(self) -> str: - return f"The value of {self._label} [{self._identifier.value}]" + return self._identifier.value + + @property + def term_id(self) -> hpotk.TermId: + return self._identifier + + @property + def label(self) -> str: + return self._label def score( self, diff --git a/src/gpsea/analysis/temporal/_api.py b/src/gpsea/analysis/temporal/_api.py index 8d654d23a..6a739c3bf 100644 --- a/src/gpsea/analysis/temporal/_api.py +++ b/src/gpsea/analysis/temporal/_api.py @@ -73,20 +73,22 @@ def __init__( ): super().__init__( gt_predicate=gt_predicate, + phenotype=endpoint, statistic=statistic, data=data, pval=pval, ) - assert isinstance(endpoint, Endpoint) - self._endpoint = endpoint @property def endpoint(self) -> Endpoint: """ Get the endpoint used to compute the survival of the individuals. """ - return self._endpoint + # We are sure that `self._phenotype` is assignable to `Endpoint` + # because of the instance check in `__init__` and `Endpoint` + # being a subclass of `Partitioning`. + return self._phenotype # type: ignore def plot_kaplan_meier_curves( self, @@ -125,10 +127,11 @@ def __hash__(self) -> int: def __str__(self) -> str: return ( "SurvivalAnalysisResult(" - "gt_predicate={self._gt_predicate}, " - "statistic={self._statistic}, " - "data={self._data}, " - "pval={self._pval})" + f"gt_predicate={self._gt_predicate}, " + f"endpoint={self._phenotype}, " + f"statistic={self._statistic}, " + f"data={self._data}, " + f"pval={self._pval})" ) def __repr__(self) -> str: diff --git a/src/gpsea/analysis/temporal/endpoint/_impl.py b/src/gpsea/analysis/temporal/endpoint/_impl.py index 8dadb8757..cffe0eabc 100644 --- a/src/gpsea/analysis/temporal/endpoint/_impl.py +++ b/src/gpsea/analysis/temporal/endpoint/_impl.py @@ -34,7 +34,7 @@ class Death(EndpointBase): @property def name(self) -> str: - return "Death" + return "Age of death" @property def description(self) -> str: @@ -111,7 +111,7 @@ def description(self) -> str: @property def variable_name(self) -> str: - return "Phenotype onset" + return f"Onset of {self._term_id.value}" def compute_survival( self, @@ -194,7 +194,7 @@ def description(self) -> str: @property def variable_name(self) -> str: - return "Disease onset" + return f"Onset of {self._disease_id.value}" def compute_survival( self, diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 5dc42d6a4..bf9957a6d 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -127,7 +127,7 @@ def test_eq_and_hash(self): def test_summarize_groups(self): a = allele_count(counts=((0, 1), (2,))) - assert a.summarize_groups() == "Allele counts: 0 OR 1, 2" + assert a.summarize_groups() == "Allele count: 0 OR 1, 2" class TestAllelePredicates: @@ -164,7 +164,7 @@ def test_monoallelic_predicate__general_stuff( gt_predicate = monoallelic_predicate(is_missense, is_synonymous) - assert gt_predicate.summarize_groups() == 'Allele groups: A, B' + assert gt_predicate.summarize_groups() == 'Allele group: A, B' @pytest.mark.parametrize( "individual_name,expected_name", @@ -199,7 +199,7 @@ def test_biallelic_predicate__general_stuff( gt_predicate = biallelic_predicate(is_missense, is_synonymous) - assert gt_predicate.summarize_groups() == 'Allele groups: A/A, A/B, B/B' + assert gt_predicate.summarize_groups() == 'Allele group: A/A, A/B, B/B' class TestSexPredicate: diff --git a/tests/analysis/pscore/test_pscore_api.py b/tests/analysis/pscore/test_pscore_api.py index 3c4fe4376..c3bfafe3d 100644 --- a/tests/analysis/pscore/test_pscore_api.py +++ b/tests/analysis/pscore/test_pscore_api.py @@ -1,18 +1,27 @@ +import random import pytest import pandas as pd from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.pscore import PhenotypeScoreAnalysisResult +from gpsea.analysis.pscore import PhenotypeScoreAnalysisResult, PhenotypeScorer from gpsea.analysis.pscore.stats import MannWhitneyStatistic class TestPhenotypeScoreAnalysisResult: + @pytest.fixture(scope="class") + def phenotype_scorer(self) -> PhenotypeScorer: + return PhenotypeScorer.wrap_scoring_function( + func=lambda patient: random.random(), + name="Random phenotype scorer", + ) + @pytest.fixture(scope="class") def result( self, suox_gt_predicate: GenotypePolyPredicate, + phenotype_scorer: PhenotypeScorer, ) -> PhenotypeScoreAnalysisResult: data = pd.DataFrame( data={ @@ -27,6 +36,7 @@ def result( ).set_index("patient_id") return PhenotypeScoreAnalysisResult( gt_predicate=suox_gt_predicate, + phenotype=phenotype_scorer, statistic=MannWhitneyStatistic(), data=data, pval=0.1234, diff --git a/tests/analysis/temporal/test_endpoint.py b/tests/analysis/temporal/test_endpoint.py index fad7d342d..d18a10a3b 100644 --- a/tests/analysis/temporal/test_endpoint.py +++ b/tests/analysis/temporal/test_endpoint.py @@ -76,7 +76,7 @@ def test_summarize(self): lines = endpoint.summary().splitlines() assert lines == [ - 'Death', + 'Age of death', 'Compute time until postnatal death', ]