Merge pull request #399 from monarch-initiative/report-partial-data-f…

…rom-survival-analysis Report partial data if survival statistics fails
monarch-initiative · Jan 16, 2025 · 5518116 · 5518116
2 parents d0f69d0 + 2721f33
commit 5518116
Show file tree

Hide file tree

Showing 5 changed files with 171 additions and 2 deletions.
diff --git a/docs/user-guide/analyses/survival.rst b/docs/user-guide/analyses/survival.rst
@@ -188,3 +188,96 @@ or `None` if computing the survival was impossible (see :func:`~gpsea.analysis.t
 The `Survival` reports the number of days until attaining the endpoint,
 here defined as end stage renal disease (`is_censored=False`),
 or until the individual dropped out of the analysis (`is_censored=True`).
+
+
+Troubleshooting
+===============
+
+Sometimes the survival analysis fails and an :class:`~gpsea.analysis.AnalysisException` is raised.
+For instance, the current Logrank test implementation reports a p value of `NaN`
+if the survival is the same for all individuals.
+This is unlikely an expected outcome, therefore GPSEA raises
+an :class:`~gpsea.analysis.AnalysisException` to force the user to troubleshoot.
+
+To help with troubleshooting, the data computed prior detecting the error is included in the exception's
+:attr:`~gpsea.analysis.AnalysisException.data` attribute. In survival analysis, the data should include
+the identifiers, genotype classes, and survivals of the tested individuals.
+
+Let's show this on an example. We will create a toy cohort of 10 individuals
+with onset of `Lynch syndrome I <https://hpo.jax.org/browse/disease/OMIM:120435>`_
+(`OMIM:120435`) at 40 years.
+
+>>> from gpsea.model import Cohort, Patient, Disease, Age
+>>> onset = Age.from_iso8601_period("P40Y")
+>>> individuals = [
+...     Patient.from_raw_parts(
+...         labels=label,
+...         diseases=(
+...             Disease.from_raw_parts(
+...                 term_id="OMIM:120435",
+...                 name="Lynch syndrome I",
+...                 is_observed=True,
+...                 onset=onset,
+...             ),
+...         ),
+...     )
+...     for label in "ABCDEFGHIJ"  # 10 individuals
+... ]
+>>> cohort = Cohort.from_patients(individuals)
+
+We will assign them into genotype classes on random, ...
+
+>>> from gpsea.analysis.clf import random_classifier
+>>> gt_clf = random_classifier(seed=123)
+>>> gt_clf.description
+'Classify the individual into random classes'
+
+... using the Lynch syndrome I diagnosis as the endpoint ...
+
+>>> from gpsea.analysis.temporal.endpoint import disease_onset
+>>> endpoint = disease_onset(disease_id="OMIM:120435")
+>>> endpoint.description
+'Compute time until OMIM:120435 onset'
+
+... and we will use Logrank test for differences in survival.
+
+>>> from gpsea.analysis.temporal.stats import LogRankTest
+>>> survival_statistic = LogRankTest()
+
+We put together the survival analysis ...
+
+>>> from gpsea.analysis.temporal import SurvivalAnalysis
+>>> survival_analysis = SurvivalAnalysis(
+...     statistic=survival_statistic,
+... )
+
+... which we expect to fail with an :class:`~gpsea.analysis.AnalysisException`:
+
+>>> result = survival_analysis.compare_genotype_vs_survival(
+...     cohort=cohort,
+...     gt_clf=gt_clf,
+...     endpoint=endpoint,
+... )
+Traceback (most recent call last):
+  ...
+gpsea.analysis._base.AnalysisException: The survival values did not meet the expectation of the statistical test!
+
+The genotype classes and survival values can be retrieved from the exception:
+
+>>> from gpsea.analysis import AnalysisException
+>>> try:
+...     result = survival_analysis.compare_genotype_vs_survival(
+...         cohort=cohort,
+...         gt_clf=gt_clf,
+...         endpoint=endpoint,
+...     )
+... except AnalysisException as ae:
+...     genotypes = ae.data["genotype"]
+...     survivals = ae.data["survival"]
+
+and the values can come in handy in troubleshooting:
+
+>>> genotypes[:3]
+(0, 0, 0)
+>>> survivals[:3]
+(Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False))
diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py
@@ -395,6 +395,11 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta):
     that tested a single genotype-phenotype association.
     """
 
+    SAMPLE_ID = "patient_id"
+    """
+    Name of the data index.
+    """
+
     GT_COL = "genotype"
     """
     Name of column for storing genotype data.

diff --git a/src/gpsea/analysis/clf/__init__.py b/src/gpsea/analysis/clf/__init__.py
@@ -9,6 +9,7 @@
     monoallelic_classifier,
     biallelic_classifier,
     allele_count,
+    random_classifier,
 )
 from ._util import (
     prepare_classifiers_for_terms_of_interest,
@@ -28,6 +29,7 @@
     "monoallelic_classifier",
     "biallelic_classifier",
     "allele_count",
+    "random_classifier",
     "PhenotypeClassifier",
     "PhenotypeCategorization",
     "P",

diff --git a/src/gpsea/analysis/clf/_gt_classifiers.py b/src/gpsea/analysis/clf/_gt_classifiers.py
@@ -1,3 +1,4 @@
+import random
 import typing
 
 from collections import Counter
@@ -640,3 +641,62 @@ def diagnosis_classifier(
         diagnoses=diagnoses,
         labels=labels,
     )
+
+
+class RandomClassifier(GenotypeClassifier):
+
+    CATS = (
+        Categorization(
+            category=PatientCategory(
+                cat_id=0, name="A",
+            )
+        ),
+        Categorization(
+            category=PatientCategory(
+                cat_id=1, name="B",
+            )
+        )
+    )
+
+    def __init__(
+        self,
+        seed: typing.Optional[float] = None,
+    ):
+        self._rng = random.Random(x=seed)
+
+    @property
+    def name(self) -> str:
+        return "Random Classifier"
+
+    @property
+    def description(self) -> str:
+        return "Classify the individual into random classes"
+
+    @property
+    def variable_name(self) -> str:
+        return "Randomness"
+
+    def get_categorizations(self) -> typing.Sequence[Categorization]:
+        return RandomClassifier.CATS
+
+    def test(self, _: Patient) -> typing.Optional[Categorization]:
+        return self._rng.choice(RandomClassifier.CATS)
+
+    def __eq__(self, value: object) -> bool:
+        return isinstance(value, RandomClassifier) and self._rng == value._rng
+
+    def __hash__(self) -> int:
+        return hash((self._rng, ))
+
+
+def random_classifier(
+    seed: typing.Optional[float] = None,
+) -> GenotypeClassifier:
+    """
+    Genotype classifier to assign an individual into one of two classes, `A` and `B` on random..
+
+    :param seed: the seed for the random number generator.
+    """
+    return RandomClassifier(
+        seed=seed,
+    )
diff --git a/src/gpsea/analysis/temporal/_api.py b/src/gpsea/analysis/temporal/_api.py
@@ -168,7 +168,10 @@ def compare_genotype_vs_survival(
         Execute the survival analysis on a given `cohort`.
         """
 
-        idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id")
+        idx = pd.Index(
+            (patient.patient_id for patient in cohort),
+            name=MonoPhenotypeAnalysisResult.SAMPLE_ID,
+        )
         data = pd.DataFrame(
             None,
             index=idx,
@@ -194,8 +197,14 @@ def compare_genotype_vs_survival(
         vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations())
         result = self._statistic.compute_pval(vals)
         if math.isnan(result.pval):
+            partial = {
+                MonoPhenotypeAnalysisResult.SAMPLE_ID: tuple(data.index),
+                "genotype": tuple(data[MonoPhenotypeAnalysisResult.GT_COL]),
+                "survival": tuple(data[MonoPhenotypeAnalysisResult.PH_COL]),
+            }
+
             raise AnalysisException(
-                dict(data=data),
+                partial,
                 "The survival values did not meet the expectation of the statistical test!",
             )