Skip to content

Commit

Permalink
Merge pull request #399 from monarch-initiative/report-partial-data-f…
Browse files Browse the repository at this point in the history
…rom-survival-analysis

Report partial data if survival statistics fails
  • Loading branch information
ielis authored Jan 16, 2025
2 parents d0f69d0 + 2721f33 commit 5518116
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 2 deletions.
93 changes: 93 additions & 0 deletions docs/user-guide/analyses/survival.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,96 @@ or `None` if computing the survival was impossible (see :func:`~gpsea.analysis.t
The `Survival` reports the number of days until attaining the endpoint,
here defined as end stage renal disease (`is_censored=False`),
or until the individual dropped out of the analysis (`is_censored=True`).


Troubleshooting
===============

Sometimes the survival analysis fails and an :class:`~gpsea.analysis.AnalysisException` is raised.
For instance, the current Logrank test implementation reports a p value of `NaN`
if the survival is the same for all individuals.
This is unlikely an expected outcome, therefore GPSEA raises
an :class:`~gpsea.analysis.AnalysisException` to force the user to troubleshoot.

To help with troubleshooting, the data computed prior detecting the error is included in the exception's
:attr:`~gpsea.analysis.AnalysisException.data` attribute. In survival analysis, the data should include
the identifiers, genotype classes, and survivals of the tested individuals.

Let's show this on an example. We will create a toy cohort of 10 individuals
with onset of `Lynch syndrome I <https://hpo.jax.org/browse/disease/OMIM:120435>`_
(`OMIM:120435`) at 40 years.

>>> from gpsea.model import Cohort, Patient, Disease, Age
>>> onset = Age.from_iso8601_period("P40Y")
>>> individuals = [
... Patient.from_raw_parts(
... labels=label,
... diseases=(
... Disease.from_raw_parts(
... term_id="OMIM:120435",
... name="Lynch syndrome I",
... is_observed=True,
... onset=onset,
... ),
... ),
... )
... for label in "ABCDEFGHIJ" # 10 individuals
... ]
>>> cohort = Cohort.from_patients(individuals)

We will assign them into genotype classes on random, ...

>>> from gpsea.analysis.clf import random_classifier
>>> gt_clf = random_classifier(seed=123)
>>> gt_clf.description
'Classify the individual into random classes'

... using the Lynch syndrome I diagnosis as the endpoint ...

>>> from gpsea.analysis.temporal.endpoint import disease_onset
>>> endpoint = disease_onset(disease_id="OMIM:120435")
>>> endpoint.description
'Compute time until OMIM:120435 onset'

... and we will use Logrank test for differences in survival.

>>> from gpsea.analysis.temporal.stats import LogRankTest
>>> survival_statistic = LogRankTest()

We put together the survival analysis ...

>>> from gpsea.analysis.temporal import SurvivalAnalysis
>>> survival_analysis = SurvivalAnalysis(
... statistic=survival_statistic,
... )

... which we expect to fail with an :class:`~gpsea.analysis.AnalysisException`:

>>> result = survival_analysis.compare_genotype_vs_survival(
... cohort=cohort,
... gt_clf=gt_clf,
... endpoint=endpoint,
... )
Traceback (most recent call last):
...
gpsea.analysis._base.AnalysisException: The survival values did not meet the expectation of the statistical test!

The genotype classes and survival values can be retrieved from the exception:

>>> from gpsea.analysis import AnalysisException
>>> try:
... result = survival_analysis.compare_genotype_vs_survival(
... cohort=cohort,
... gt_clf=gt_clf,
... endpoint=endpoint,
... )
... except AnalysisException as ae:
... genotypes = ae.data["genotype"]
... survivals = ae.data["survival"]

and the values can come in handy in troubleshooting:

>>> genotypes[:3]
(0, 0, 0)
>>> survivals[:3]
(Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False))
5 changes: 5 additions & 0 deletions src/gpsea/analysis/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,11 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta):
that tested a single genotype-phenotype association.
"""

SAMPLE_ID = "patient_id"
"""
Name of the data index.
"""

GT_COL = "genotype"
"""
Name of column for storing genotype data.
Expand Down
2 changes: 2 additions & 0 deletions src/gpsea/analysis/clf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
monoallelic_classifier,
biallelic_classifier,
allele_count,
random_classifier,
)
from ._util import (
prepare_classifiers_for_terms_of_interest,
Expand All @@ -28,6 +29,7 @@
"monoallelic_classifier",
"biallelic_classifier",
"allele_count",
"random_classifier",
"PhenotypeClassifier",
"PhenotypeCategorization",
"P",
Expand Down
60 changes: 60 additions & 0 deletions src/gpsea/analysis/clf/_gt_classifiers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import random
import typing

from collections import Counter
Expand Down Expand Up @@ -640,3 +641,62 @@ def diagnosis_classifier(
diagnoses=diagnoses,
labels=labels,
)


class RandomClassifier(GenotypeClassifier):

CATS = (
Categorization(
category=PatientCategory(
cat_id=0, name="A",
)
),
Categorization(
category=PatientCategory(
cat_id=1, name="B",
)
)
)

def __init__(
self,
seed: typing.Optional[float] = None,
):
self._rng = random.Random(x=seed)

@property
def name(self) -> str:
return "Random Classifier"

@property
def description(self) -> str:
return "Classify the individual into random classes"

@property
def variable_name(self) -> str:
return "Randomness"

def get_categorizations(self) -> typing.Sequence[Categorization]:
return RandomClassifier.CATS

def test(self, _: Patient) -> typing.Optional[Categorization]:
return self._rng.choice(RandomClassifier.CATS)

def __eq__(self, value: object) -> bool:
return isinstance(value, RandomClassifier) and self._rng == value._rng

def __hash__(self) -> int:
return hash((self._rng, ))


def random_classifier(
seed: typing.Optional[float] = None,
) -> GenotypeClassifier:
"""
Genotype classifier to assign an individual into one of two classes, `A` and `B` on random..
:param seed: the seed for the random number generator.
"""
return RandomClassifier(
seed=seed,
)
13 changes: 11 additions & 2 deletions src/gpsea/analysis/temporal/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,10 @@ def compare_genotype_vs_survival(
Execute the survival analysis on a given `cohort`.
"""

idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id")
idx = pd.Index(
(patient.patient_id for patient in cohort),
name=MonoPhenotypeAnalysisResult.SAMPLE_ID,
)
data = pd.DataFrame(
None,
index=idx,
Expand All @@ -194,8 +197,14 @@ def compare_genotype_vs_survival(
vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations())
result = self._statistic.compute_pval(vals)
if math.isnan(result.pval):
partial = {
MonoPhenotypeAnalysisResult.SAMPLE_ID: tuple(data.index),
"genotype": tuple(data[MonoPhenotypeAnalysisResult.GT_COL]),
"survival": tuple(data[MonoPhenotypeAnalysisResult.PH_COL]),
}

raise AnalysisException(
dict(data=data),
partial,
"The survival values did not meet the expectation of the statistical test!",
)

Expand Down

0 comments on commit 5518116

Please sign in to comment.