Merge pull request #336 from monarch-initiative/report-the-analysis-components

Expose more analysis components
P2GX · Oct 29, 2024 · f20af85 · f20af85
2 parents e1f7ab3 + 7390305
commit f20af85
Show file tree

Hide file tree

Showing 14 changed files with 103 additions and 43 deletions.
diff --git a/docs/user-guide/analyses/measurements.rst b/docs/user-guide/analyses/measurements.rst
@@ -87,7 +87,7 @@ We use the measurement of `Testosterone [Mass/volume] in Serum or Plasma <https:
 ...     label="Testosterone [Mass/volume] in Serum or Plasma",
 ... )
 >>> pheno_scorer.description
-'The value of Testosterone [Mass/volume] in Serum or Plasma [LOINC:2986-8]'
+'Value of Testosterone [Mass/volume] in Serum or Plasma [LOINC:2986-8]'
 
 
 Statistical test

diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst
@@ -39,7 +39,7 @@ and now we can set up a predicate to test for presence of *Abnormal lens morphol
 >>> pheno_predicate.name
 'HPO Predicate'
 >>> pheno_predicate.description
-'Test for presence of Abnormal lens morphology'
+'Test for presence of Abnormal lens morphology [HP:0000517]'
 >>> pheno_predicate.group_labels
 ('Yes', 'No')
 

diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py
@@ -7,6 +7,7 @@
 
 from .predicate.phenotype import PhenotypePolyPredicate, P
 from .predicate.genotype import GenotypePolyPredicate
+from ._partition import Partitioning
 
 
 class Statistic(metaclass=abc.ABCMeta):
@@ -266,12 +267,16 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta):
     def __init__(
         self,
         gt_predicate: GenotypePolyPredicate,
+        phenotype: Partitioning,
         statistic: Statistic,
         data: pd.DataFrame,
         pval: float,
     ):
         super().__init__(gt_predicate, statistic)
 
+        assert isinstance(phenotype, Partitioning)
+        self._phenotype = phenotype
+
         assert isinstance(data, pd.DataFrame) and all(
             col in data.columns for col in MonoPhenotypeAnalysisResult.DATA_COLUMNS
         )
@@ -283,6 +288,13 @@ def __init__(
             raise ValueError(
                 f"`pval` must be a finite float in range [0, 1] but it was {pval}"
             )
+
+    @property
+    def phenotype(self) -> Partitioning:
+        """
+        Get the :class:`~gpsea.analysis.Partitioning` that produced the phenotype.
+        """
+        return self._phenotype
 
     @property
     def data(self) -> pd.DataFrame:
@@ -325,12 +337,14 @@ def pval(self) -> float:
     def __eq__(self, value: object) -> bool:
         return isinstance(value, MonoPhenotypeAnalysisResult) \
             and super(AnalysisResult, self).__eq__(value) \
+            and self._phenotype == value._phenotype \
             and self._pval == value._pval \
             and self._data.equals(value._data)
 
     def __hash__(self) -> int:
         return hash((
             super(AnalysisResult, self).__hash__(),
+            self._phenotype,
             self._pval,
             self._data,
         ))
diff --git a/src/gpsea/analysis/_partition.py b/src/gpsea/analysis/_partition.py
@@ -35,7 +35,7 @@ def variable_name(self) -> str:
         """
         Get a `str` with the name of the variable investigated by the partitioning.
 
-        For instance `Sex`, `Allele groups`, `HPO term`, `Diagnosis`
+        For instance `Sex`, `Allele group`, `HP:0001250`, `OMIM:256000`
         """
         pass
 

diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
@@ -387,11 +387,12 @@ def _compute_result(
         )
 
         # 2 - Apply MTC filter and select p values to MTC
+        cohort_size = sum(1 for _ in cohort)
         mtc_filter_results = self._mtc_filter.filter(
             gt_predicate=gt_predicate,
             ph_predicates=pheno_predicates,
             counts=all_counts,
-            cohort_size=len(cohort),
+            cohort_size=cohort_size,
         )
 
         pvals = np.full(shape=(len(n_usable),), fill_value=np.nan)

diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -211,7 +211,7 @@ def description(self) -> str:
 
     @property
     def variable_name(self) -> str:
-        return "Allele groups"
+        return "Allele group"
 
     def test(self, patient: Patient) -> typing.Optional[Categorization]:
         self._check_patient(patient)
@@ -357,13 +357,13 @@ def allele_count(
     >>> from gpsea.analysis.predicate.genotype import allele_count
     >>> zero_vs_one = allele_count(counts=({0,}, {1,}))
     >>> zero_vs_one.summarize_groups()
-    'Allele counts: 0, 1'
+    'Allele count: 0, 1'
     
     These counts will create three groups for individuals with zero, one or two alleles:
 
     >>> zero_vs_one_vs_two = allele_count(counts=({0,}, {1,}, {2,}))
     >>> zero_vs_one_vs_two.summarize_groups()
-    'Allele counts: 0, 1, 2'
+    'Allele count: 0, 1, 2'
 
     :param counts: a sequence with allele count partitions.
     :param target: a predicate for choosing the variants for testing
@@ -415,7 +415,7 @@ def description(self) -> str:
 
     @property
     def variable_name(self) -> str:
-        return "Allele counts"
+        return "Allele count"
 
     def test(self, patient: Patient) -> typing.Optional[Categorization]:
         self._check_patient(patient)
@@ -540,14 +540,6 @@ def create(
         # Last, put the predicate together.
         return DiagnosisPredicate(categorizations)
 
-    @property
-    def name(self) -> str:
-        return "Diagnosis Predicate"
-
-    @property
-    def description(self) -> str:
-        return "Partition the individual by diagnosis"
-
     def __init__(
         self,
         categorizations: typing.Mapping[hpotk.TermId, Categorization],
@@ -558,6 +550,15 @@ def __init__(
         )
         self._hash = hash(tuple(categorizations.items()))
 
+    @property
+    def name(self) -> str:
+        return "Diagnosis Predicate"
+
+    @property
+    def description(self) -> str:
+        diagnoses = ", ".join(cat.category.name for cat in self._categorizations)
+        return f"Partition the individual by presence of {diagnoses}"
+
     @property
     def variable_name(self) -> str:
         return "Diagnosis"

diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -152,11 +152,11 @@ def name(self) -> str:
 
     @property
     def description(self) -> str:
-        return f"Test for presence of {self._query_label}"
+        return f"Test for presence of {self._query_label} [{self._query.value}]"
 
     @property
     def variable_name(self) -> str:
-        return f"{self._query_label} is present"
+        return self._query.value
 
     @property
     def phenotype(self) -> hpotk.TermId:
@@ -257,11 +257,11 @@ def name(self) -> str:
 
     @property
     def description(self) -> str:
-        return "Partition based on a diagnosis"
+        return f"Partition based on a diagnosis of {self._query.value}"
 
     @property
     def variable_name(self) -> str:
-        return f"{self._query.value} was diagnosed"
+        return self._query.value
 
     @property
     def phenotype(self) -> hpotk.TermId:

diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py
@@ -7,7 +7,7 @@
 from ..predicate.genotype import GenotypePolyPredicate
 from .stats import PhenotypeScoreStatistic
 
-from .._base import MonoPhenotypeAnalysisResult
+from .._base import MonoPhenotypeAnalysisResult, Statistic
 from .._partition import ContinuousPartitioning
 
 
@@ -71,7 +71,7 @@ def description(self) -> str:
 
     @property
     def variable_name(self) -> str:
-        return "Score"
+        return "Phenotype score"
 
     def __init__(
         self,
@@ -116,6 +116,26 @@ class PhenotypeScoreAnalysisResult(MonoPhenotypeAnalysisResult):
     if the phenotype score is impossible to compute.
     """
 
+    def __init__(
+        self,
+        gt_predicate: GenotypePolyPredicate,
+        phenotype: PhenotypeScorer,
+        statistic: Statistic,
+        data: pd.DataFrame,
+        pval: float,
+    ):
+        super().__init__(gt_predicate, phenotype, statistic, data, pval)
+        assert isinstance(phenotype, PhenotypeScorer)
+
+    def phenotype_scorer(self) -> PhenotypeScorer:
+        """
+        Get the scorer that computed the phenotype score.
+        """
+        # We are sure that `self._phenotype` is a `PhenotypeScorer`
+        # because of the instance check in `__init__` and `PhenotypeScorer`
+        # being a subclass of `Partitioning`.
+        return self._phenotype  # type: ignore
+
     def plot_boxplots(
         self,
         ax,
@@ -168,10 +188,11 @@ def __hash__(self) -> int:
     def __str__(self) -> str:
         return (
             "PhenotypeScoreAnalysisResult("
-            "gt_predicate={self._gt_predicate}, "
-            "statistic={self._statistic}, "
-            "data={self._data}, "
-            "pval={self._pval})"
+            f"gt_predicate={self._gt_predicate}, "
+            f"phenotype_scorer={self._phenotype}, "
+            f"statistic={self._statistic}, "
+            f"data={self._data}, "
+            f"pval={self._pval})"
         )
 
     def __repr__(self) -> str:
@@ -213,6 +234,7 @@ def compare_genotype_vs_phenotype_score(
         assert (
             gt_predicate.n_categorizations() == 2
         ), "We only support 2 genotype categories at this point"
+        assert isinstance(pheno_scorer, PhenotypeScorer)
 
         idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id")
         data = pd.DataFrame(
@@ -244,6 +266,7 @@ def compare_genotype_vs_phenotype_score(
 
         return PhenotypeScoreAnalysisResult(
             gt_predicate=gt_predicate,
+            phenotype=pheno_scorer,
             statistic=self._statistic,
             data=data,
             pval=pval,

diff --git a/src/gpsea/analysis/pscore/_measurement.py b/src/gpsea/analysis/pscore/_measurement.py
@@ -68,11 +68,19 @@ def name(self) -> str:
 
     @property
     def description(self) -> str:
-        return self.variable_name
+        return f"Value of {self._label} [{self._identifier.value}]"
 
     @property
     def variable_name(self) -> str:
-        return f"The value of {self._label} [{self._identifier.value}]"
+        return self._identifier.value
+
+    @property
+    def term_id(self) -> hpotk.TermId:
+        return self._identifier
+
+    @property
+    def label(self) -> str:
+        return self._label
 
     def score(
         self,

diff --git a/src/gpsea/analysis/temporal/_api.py b/src/gpsea/analysis/temporal/_api.py
@@ -73,20 +73,22 @@ def __init__(
     ):
         super().__init__(
             gt_predicate=gt_predicate,
+            phenotype=endpoint,
             statistic=statistic,
             data=data,
             pval=pval,
         )
-
         assert isinstance(endpoint, Endpoint)
-        self._endpoint = endpoint
 
     @property
     def endpoint(self) -> Endpoint:
         """
         Get the endpoint used to compute the survival of the individuals.
         """
-        return self._endpoint
+        # We are sure that `self._phenotype` is assignable to `Endpoint`
+        # because of the instance check in `__init__` and `Endpoint`
+        # being a subclass of `Partitioning`.
+        return self._phenotype  # type: ignore
 
     def plot_kaplan_meier_curves(
         self,
@@ -125,10 +127,11 @@ def __hash__(self) -> int:
     def __str__(self) -> str:
         return (
             "SurvivalAnalysisResult("
-            "gt_predicate={self._gt_predicate}, "
-            "statistic={self._statistic}, "
-            "data={self._data}, "
-            "pval={self._pval})"
+            f"gt_predicate={self._gt_predicate}, "
+            f"endpoint={self._phenotype}, "
+            f"statistic={self._statistic}, "
+            f"data={self._data}, "
+            f"pval={self._pval})"
         )
 
     def __repr__(self) -> str:

diff --git a/src/gpsea/analysis/temporal/endpoint/_impl.py b/src/gpsea/analysis/temporal/endpoint/_impl.py
@@ -34,7 +34,7 @@ class Death(EndpointBase):
 
     @property
     def name(self) -> str:
-        return "Death"
+        return "Age of death"
 
     @property
     def description(self) -> str:
@@ -111,7 +111,7 @@ def description(self) -> str:
 
     @property
     def variable_name(self) -> str:
-        return "Phenotype onset"
+        return f"Onset of {self._term_id.value}"
 
     def compute_survival(
         self,
@@ -194,7 +194,7 @@ def description(self) -> str:
 
     @property
     def variable_name(self) -> str:
-        return "Disease onset"
+        return f"Onset of {self._disease_id.value}"
 
     def compute_survival(
         self,

diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -127,7 +127,7 @@ def test_eq_and_hash(self):
     def test_summarize_groups(self):
         a = allele_count(counts=((0, 1), (2,)))
 
-        assert a.summarize_groups() == "Allele counts: 0 OR 1, 2"
+        assert a.summarize_groups() == "Allele count: 0 OR 1, 2"
 
 
 class TestAllelePredicates:
@@ -164,7 +164,7 @@ def test_monoallelic_predicate__general_stuff(
 
         gt_predicate = monoallelic_predicate(is_missense, is_synonymous)
 
-        assert gt_predicate.summarize_groups() == 'Allele groups: A, B'
+        assert gt_predicate.summarize_groups() == 'Allele group: A, B'
 
     @pytest.mark.parametrize(
         "individual_name,expected_name",
@@ -199,7 +199,7 @@ def test_biallelic_predicate__general_stuff(
 
         gt_predicate = biallelic_predicate(is_missense, is_synonymous)
 
-        assert gt_predicate.summarize_groups() == 'Allele groups: A/A, A/B, B/B'
+        assert gt_predicate.summarize_groups() == 'Allele group: A/A, A/B, B/B'
 
 
 class TestSexPredicate:
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,7 +35,7 @@ def variable_name(self) -> str: @@
             """
             Get a `str` with the name of the variable investigated by the partitioning.
-            For instance `Sex`, `Allele groups`, `HPO term`, `Diagnosis`
+            For instance `Sex`, `Allele group`, `HP:0001250`, `OMIM:256000`
             """
             pass
@@ Expand Down @@