From 53653e5d813197b19ecd0c2b2a8bf9f5a925daab Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 4 Sep 2024 14:13:19 +0200 Subject: [PATCH 1/2] Allow using `Sex` as a genotype predicate. --- .../analysis/predicate/genotype/__init__.py | 4 +- .../predicate/genotype/_gt_predicates.py | 46 ++++++++++++++++++- src/gpsea/model/_base.py | 2 +- .../predicate/genotype/test_gt_predicates.py | 34 +++++++++++++- 4 files changed, 81 insertions(+), 5 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 82d2819d5..069cb9f6d 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,13 +1,13 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'filtering_predicate', + 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 8693b67bd..66a839ea7 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -71,7 +71,6 @@ def __repr__(self) -> str: return str(self) -# TODO: write AD, AR, XLR, XLD def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: """ Create a genotype boolean predicate from given `variant_predicate` @@ -657,3 +656,48 @@ def __str__(self) -> str: def __repr__(self) -> str: return str(self) + + +class SexGenotypePredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + def __init__(self): + self._categorizations = ( + Categorization( + PatientCategory( + cat_id=0, name="FEMALE", description="Female", + ), + ), + Categorization( + PatientCategory( + cat_id=1, name="MALE", description="Male", + ), + ) + ) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return self._categorizations + + def get_question_base(self) -> str: + return "Sex of the individual" + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + if patient.sex.is_provided(): + if patient.sex.is_female(): + return self._categorizations[0] + elif patient.sex.is_male(): + return self._categorizations[1] + else: + raise ValueError(f'Unsupported sex {patient.sex}') + else: + return None + + +INSTANCE = SexGenotypePredicate() + + +def sex_predicate() -> GenotypePolyPredicate: + """ + Get a genotype predicate for categorizing patients by their :class:`~gpsea.model.Sex`. + """ + return INSTANCE diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py index 09a1c913d..53b518027 100644 --- a/src/gpsea/model/_base.py +++ b/src/gpsea/model/_base.py @@ -42,7 +42,7 @@ def is_female(self) -> bool: """ Return `True` if the sex represents a `FEMALE`. """ - return self == Sex.MALE + return self == Sex.FEMALE def is_male(self) -> bool: """ diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 67bc2d117..61ff29c30 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -7,6 +7,7 @@ GenotypePolyPredicate, groups_predicate, filtering_predicate, + sex_predicate, VariantPredicates, VariantPredicate, ModeOfInheritancePredicate, @@ -185,7 +186,7 @@ def test_x_recessive( assert categorization.category.name == name -class TestPolyPredicate: +class TestFilteringPredicate: @pytest.fixture(scope="class") def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: @@ -263,3 +264,34 @@ def test_filtering_predicate__explodes_when_using_one_category( ve.value.args[0] == "At least 2 target categorizations must be provided but got 1" ) + + +class TestSexPredicate: + + def test_sex_predicate( + self, + ): + joe = TestSexPredicate.make_patient('Joe', Sex.MALE) + jane = TestSexPredicate.make_patient('Jane', Sex.FEMALE) + miffy = TestSexPredicate.make_patient('Miffy', Sex.UNKNOWN_SEX) + + gt_predicate = sex_predicate() + female, male = gt_predicate.get_categorizations() + + assert gt_predicate.test(joe) == male + assert gt_predicate.test(jane) == female + assert gt_predicate.test(miffy) is None + + def test_get_question(self): + gt_predicate = sex_predicate() + assert gt_predicate.display_question() == 'Sex of the individual: FEMALE, MALE' + + @staticmethod + def make_patient(label: str, sex: Sex) -> Patient: + return Patient( + SampleLabels(label), + sex, + phenotypes=(), + diseases=(), + variants=(), + ) From c61b087254e1e4721507ad9f3c8f28206c6171d2 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 4 Sep 2024 14:53:14 +0200 Subject: [PATCH 2/2] Allow using disease diagnosis as a genotype predicate. --- docs/user-guide/predicates.rst | 39 ++++ .../analysis/predicate/genotype/__init__.py | 4 +- .../predicate/genotype/_gt_predicates.py | 199 +++++++++++++++--- 3 files changed, 205 insertions(+), 37 deletions(-) diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index b224516f5..785c3ef0c 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -399,6 +399,45 @@ for testing if the individual has at least one missense vs. frameshift vs. synon 'Genotype group: Missense, Frameshift, Synonymous' +.. _sex-predicate: + +Partition by the sex of the individual +====================================== + +It is easy to investigate the phenotypic differences between females and males. +The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate +for partitioning based on the sex of the individual: + +>>> from gpsea.analysis.predicate.genotype import sex_predicate +>>> gt_predicate = sex_predicate() +>>> gt_predicate.display_question() +'Sex of the individual: FEMALE, MALE' + +The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. + + +.. _diagnosis-predicate: + +Partition by a diagnosis +======================== + +It is also possible to bin the individuals based on a diagnosis. +The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` +prepares a genotype predicate for assigning an individual into a diagnosis group: + +>>> from gpsea.analysis.predicate.genotype import diagnosis_predicate +>>> gt_predicate = diagnosis_predicate( +... diagnoses=('OMIM:154700', 'OMIM:129600'), +... labels=('Marfan syndrome', 'Ectopia lentis, familial'), +... ) +>>> gt_predicate.display_question() +'What disease was diagnosed: OMIM:154700, OMIM:129600' + +Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses +(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) +will be automatically omitted from the analysis. + + .. _phenotype-predicates: ******************** diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 069cb9f6d..23ad8cfbb 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,13 +1,13 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', + 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'diagnosis_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 66a839ea7..3a9301273 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,9 +1,12 @@ +from cProfile import label import dataclasses import enum import typing from collections import defaultdict +import hpotk + from gpsea.model import Patient, Sex from .._api import Categorization, PatientCategory, PatientCategories @@ -193,31 +196,35 @@ def create( ) -> "FilteringGenotypePolyPredicate": # At least 2 target categorizations must be provided if len(targets) <= 1: - raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}') + raise ValueError( + f"At least 2 target categorizations must be provided but got {len(targets)}" + ) good_boys = tuple(isinstance(cat, Categorization) for cat in targets) if not all(good_boys): - offenders = ', '.join( - str(i) - for i, is_instance - in enumerate(good_boys) if not is_instance + offenders = ", ".join( + str(i) for i, is_instance in enumerate(good_boys) if not is_instance + ) + raise ValueError( + f"The targets at following indices are not categorizations: [{offenders}]" ) - raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]') # All `allowed` categorizations must in fact be present in the `base` predicate. - cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets) + cats_are_in_fact_present = tuple( + cat in predicate.get_categorizations() for cat in targets + ) if not all(cats_are_in_fact_present): - missing = ', '.join( + missing = ", ".join( c.category.name - for c, is_present - in zip(targets, cats_are_in_fact_present) if not is_present + for c, is_present in zip(targets, cats_are_in_fact_present) + if not is_present ) - raise ValueError(f'Some from the categories are not present: {missing}') - + raise ValueError(f"Some from the categories are not present: {missing}") + if len(targets) == predicate.n_categorizations(): raise ValueError( - f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations ' - f'with the same number ({len(targets)}) of targets' + f"It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations " + f"with the same number ({len(targets)}) of targets" ) return FilteringGenotypePolyPredicate( @@ -232,7 +239,7 @@ def __init__( ): self._predicate = predicate self._allowed = tuple(allowed) - + def get_categorizations(self) -> typing.Sequence[Categorization]: return self._allowed @@ -249,7 +256,7 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def __repr__(self): return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" - + def filtering_predicate( predicate: GenotypePolyPredicate, targets: typing.Collection[Categorization], @@ -288,12 +295,12 @@ class MendelianInheritanceAspect(enum.Enum): """ Related to chromosomes that do *not* determine the sex of an individual. """ - + GONOSOMAL = 1 """ Related to chromosomes that determine the sex of an individual. """ - + MITOCHONDRIAL = 2 """ Related to mitochondrial DNA. @@ -306,23 +313,30 @@ class ModeOfInheritanceInfo: HOM_REF = Categorization( PatientCategory( - cat_id=0, name="HOM_REF", description="Homozygous reference", + cat_id=0, + name="HOM_REF", + description="Homozygous reference", ), ) HET = Categorization( PatientCategory( - cat_id=1, name="HET", description="Heterozygous", + cat_id=1, + name="HET", + description="Heterozygous", ), ) BIALLELIC_ALT = Categorization( PatientCategory( - cat_id=2, name="BIALLELIC_ALT", + cat_id=2, + name="BIALLELIC_ALT", description="Homozygous alternate or compound heterozygous", ), ) HEMI = Categorization( PatientCategory( - cat_id=3, name="HEMI", description="Hemizygous", + cat_id=3, + name="HEMI", + description="Hemizygous", ), ) @@ -438,7 +452,7 @@ def __init__( assert isinstance(group, GenotypeGroup) self._groups[group.allele_count].append(group) hash_value += 13 * hash(group) - + self._hash = hash_value @property @@ -507,7 +521,7 @@ def autosomal_dominant( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), ) - + @staticmethod def autosomal_recessive( variant_predicate: VariantPredicate, @@ -524,7 +538,7 @@ def autosomal_recessive( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), ) - + @staticmethod def x_dominant( variant_predicate: VariantPredicate, @@ -533,7 +547,7 @@ def x_dominant( Create a predicate that assigns the patient either into homozygous reference or heterozygous group in line with the X-linked dominant mode of inheritance. - + :param variant_predicate: a predicate for choosing the variants for testing. """ return ModeOfInheritancePredicate.from_moi_info( @@ -583,11 +597,15 @@ def __init__( assert isinstance(mode_of_inheritance_info, ModeOfInheritanceInfo) self._moi_info = mode_of_inheritance_info - self._categorizations = tuple(group.categorization for group in mode_of_inheritance_info.groups) - issues = ModeOfInheritancePredicate._check_categorizations(self._categorizations) + self._categorizations = tuple( + group.categorization for group in mode_of_inheritance_info.groups + ) + issues = ModeOfInheritancePredicate._check_categorizations( + self._categorizations + ) if issues: - raise ValueError('Cannot create predicate: {}'.format(', '.join(issues))) - self._question = 'Which genotype group does the patient fit in' + raise ValueError("Cannot create predicate: {}".format(", ".join(issues))) + self._question = "Which genotype group does the patient fit in" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -645,7 +663,12 @@ def __eq__(self, value: object) -> bool: ) def __hash__(self) -> int: - return hash((self._allele_counter, self._moi_info,)) + return hash( + ( + self._allele_counter, + self._moi_info, + ) + ) def __str__(self) -> str: return ( @@ -665,14 +688,18 @@ def __init__(self): self._categorizations = ( Categorization( PatientCategory( - cat_id=0, name="FEMALE", description="Female", + cat_id=0, + name="FEMALE", + description="Female", ), ), Categorization( PatientCategory( - cat_id=1, name="MALE", description="Male", + cat_id=1, + name="MALE", + description="Male", ), - ) + ), ) def get_categorizations(self) -> typing.Sequence[Categorization]: @@ -688,7 +715,7 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: elif patient.sex.is_male(): return self._categorizations[1] else: - raise ValueError(f'Unsupported sex {patient.sex}') + raise ValueError(f"Unsupported sex {patient.sex}") else: return None @@ -699,5 +726,107 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def sex_predicate() -> GenotypePolyPredicate: """ Get a genotype predicate for categorizing patients by their :class:`~gpsea.model.Sex`. + + See the :ref:`sex-predicate` section for an example. """ return INSTANCE + + +class DiagnosisPredicate(GenotypePolyPredicate): + + @staticmethod + def create( + diagnoses: typing.Iterable[typing.Union[str, hpotk.TermId]], + labels: typing.Optional[typing.Iterable[str]] = None, + ) -> "DiagnosisPredicate": + # First, collect the iterables and check sanity. + diagnosis_ids = [] + for d in diagnoses: + if isinstance(d, str): + d = hpotk.TermId.from_curie(d) + elif isinstance(d, hpotk.TermId): + pass + else: + raise ValueError(f"{d} is neither `str` nor `hpotk.TermId`") + + diagnosis_ids.append(d) + + if labels is None: + labels = tuple(d.value for d in diagnosis_ids) + else: + labels = tuple(labels) + + assert (len(diagnosis_ids) >= 2), \ + f"We need at least 2 diagnoses: {len(diagnosis_ids)}" + assert len(diagnosis_ids) == len(labels), \ + f"The number of labels must match the number of diagnose IDs: {len(diagnosis_ids)}!={len(labels)}" + + # Then, prepare the categorizations. + categorizations = { + diagnosis_id: Categorization.from_raw_parts( + cat_id=i, + name=diagnosis_id.value, + description=label, + ) + for i, (diagnosis_id, label) in enumerate(zip(diagnosis_ids, labels)) + } + + # Last, put the predicate together. + return DiagnosisPredicate(categorizations) + + def __init__( + self, + categorizations: typing.Mapping[hpotk.TermId, Categorization], + ): + self._id2cat = dict(categorizations) + self._categorizations = tuple( + sorted(categorizations.values(), key=lambda c: c.category.cat_id) + ) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return self._categorizations + + def get_question_base(self) -> str: + return 'What disease was diagnosed' + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + categorization = None + for disease in patient.diseases: + try: + candidate = self._id2cat[disease.identifier] + except KeyError: + # No match for this disease, no problem. + continue + + if categorization is None: + # First time we found a candidate disease + categorization = candidate + else: + # Ambiguous match. We found several matching diagnoses! + return None + + return categorization + + +def diagnosis_predicate( + diagnoses: typing.Iterable[typing.Union[str, hpotk.TermId]], + labels: typing.Optional[typing.Iterable[str]] = None, +) -> GenotypePolyPredicate: + """ + Create a genotype predicate that bins the patient based on presence of a disease diagnosis, + as listed in :attr:`~gpsea.model.Patient.diseases` attribute. + + If an individual is diagnosed with more than one disease from the provided `diagnoses`, + the individual will be assigned into no group (`None`). + + See the :ref:`diagnosis-predicate` section for an example. + + :param diagnoses: an iterable with at least 2 diagnose IDs, either as a `str` or a :class:`~hpotk.TermId` + to determine the genotype group. + :param labels: an iterable with diagnose names or `None` if CURIEs should be used instead. + The number of labels must match the number of predicates. + """ + return DiagnosisPredicate.create( + diagnoses=diagnoses, + labels=labels, + )