From 8ba618e3f71f599fd1629d19b104c307504c1fcd Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 20:46:24 +0200 Subject: [PATCH] Improve docs for `filtering_predicate`. --- docs/user-guide/predicates.rst | 58 +++++++++++ .../analysis/predicate/genotype/__init__.py | 4 +- src/gpsea/analysis/predicate/genotype/_api.py | 82 +--------------- .../predicate/genotype/_gt_predicates.py | 95 +++++++++++++++++++ .../predicate/genotype/test_gt_predicates.py | 9 +- .../predicate/genotype/test_predicates.py | 1 + 6 files changed, 163 insertions(+), 86 deletions(-) diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 6d95dfc5e..b224516f5 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -306,6 +306,64 @@ for assigning a patient into a genotype group: The `gt_predicate` can be used in downstream analysis, such as in :class: +.. _filtering-predicate: + +Filtering predicate +=================== + +Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be a need +to consider only a subset of the groups. A `GenotypePolyPredicate` +created by :func:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset +of the target categorizations of interest. + +Example +------- + +Let's suppose we want to test the genotype-phenotype association between variants +that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, +and we are specifically interested in comparing the heterozygous genotype +with the biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). 
+ +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +to bin according to a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.display_question() +'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' + +We see that the `gt_predicate` bins the patients into three groups: + +>>> cats = gt_predicate.get_categorizations() +>>> cats +(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) + +We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, +and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +that includes only the categories of interest: + +>>> from gpsea.analysis.predicate.genotype import filtering_predicate +>>> fgt_predicate = filtering_predicate( +... predicate=gt_predicate, +... targets=(cats[1], cats[2]), +... ) +>>> fgt_predicate.display_question() +'Which genotype group does the patient fit in: HET, BIALLELIC_ALT' + + +.. 
_groups-predicate: + Groups predicate ================ diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index ff8d16e1b..86dd261a2 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,13 +1,13 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'recessive_predicate', + 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py index db29c573d..39cf020be 100644 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ b/src/gpsea/analysis/predicate/genotype/_api.py @@ -1,7 +1,7 @@ import abc import typing -from gpsea.model import Patient, Variant +from gpsea.model import Variant from .._api import PolyPredicate, Categorization, PatientCategory @@ -10,85 +10,7 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate` that test the genotype axis. 
""" - - @staticmethod - def filtering_predicate( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "GenotypePolyPredicate": - """ - """ - return FilteringGenotypePolyPredicate.create( - predicate=predicate, - targets=targets, - ) - - -class FilteringGenotypePolyPredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - - @staticmethod - def create( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "FilteringGenotypePolyPredicate": - # At least 2 target categorizations must be provided - if len(targets) <= 1: - raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}') - - good_boys = tuple(isinstance(cat, Categorization) for cat in targets) - if not all(good_boys): - offenders = ', '.join( - str(i) - for i, is_instance - in enumerate(good_boys) if not is_instance - ) - raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]') - - # All `allowed` categorizations must in fact be present in the `base` predicate. 
- cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets) - if not all(cats_are_in_fact_present): - missing = ', '.join( - c.category.name - for c, is_present - in zip(targets, cats_are_in_fact_present) if not is_present - ) - raise ValueError(f'Some from the categories are not present: {missing}') - - if len(targets) == predicate.n_categorizations(): - raise ValueError( - f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations ' - f'with the same number ({len(targets)}) of targets' - ) - - return FilteringGenotypePolyPredicate( - predicate=predicate, - allowed=targets, - ) - - def __init__( - self, - predicate: "GenotypePolyPredicate", - allowed: typing.Iterable[Categorization], - ): - self._predicate = predicate - self._allowed = tuple(allowed) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return self._allowed - - def get_question_base(self) -> str: - return self._predicate.get_question_base() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - cat = self._predicate.test(patient) - if cat in self._allowed: - return cat - else: - return None - - def __repr__(self): - return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" + pass class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 4007e8d1b..43a39594a 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -151,6 +151,8 @@ def groups_predicate( The genotype groups *should* not overlap. In case of an overlap, the patient will be assigned into no group (`None`). + See the :ref:`groups-predicate` section for an example. + :param predicates: an iterable with at least 2 variant predicates to determine a genotype group. 
:param group_names: an iterable with group names. The number of group names must match the number of predicates. """ @@ -182,6 +184,99 @@ def groups_predicate( ) +class FilteringGenotypePolyPredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + @staticmethod + def create( + predicate: "GenotypePolyPredicate", + targets: typing.Collection[Categorization], + ) -> "FilteringGenotypePolyPredicate": + # At least 2 target categorizations must be provided + if len(targets) <= 1: + raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}') + + good_boys = tuple(isinstance(cat, Categorization) for cat in targets) + if not all(good_boys): + offenders = ', '.join( + str(i) + for i, is_instance + in enumerate(good_boys) if not is_instance + ) + raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]') + + # All `allowed` categorizations must in fact be present in the `base` predicate. + cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets) + if not all(cats_are_in_fact_present): + missing = ', '.join( + c.category.name + for c, is_present + in zip(targets, cats_are_in_fact_present) if not is_present + ) + raise ValueError(f'Some from the categories are not present: {missing}') + + if len(targets) == predicate.n_categorizations(): + raise ValueError( + f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations ' + f'with the same number ({len(targets)}) of targets' + ) + + return FilteringGenotypePolyPredicate( + predicate=predicate, + allowed=targets, + ) + + def __init__( + self, + predicate: "GenotypePolyPredicate", + allowed: typing.Iterable[Categorization], + ): + self._predicate = predicate + self._allowed = tuple(allowed) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return self._allowed + + def get_question_base(self) -> str: + return self._predicate.get_question_base() + + def 
test(self, patient: Patient) -> typing.Optional[Categorization]: + cat = self._predicate.test(patient) + if cat in self._allowed: + return cat + else: + return None + + def __repr__(self): + return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" + + +def filtering_predicate( + predicate: GenotypePolyPredicate, + targets: typing.Collection[Categorization], +) -> GenotypePolyPredicate: + """ + Filtering predicate applies the base `predicate` but only returns the categorizations + from the provided `targets` collection. + + This can be useful if only some of the categorizations are interesting. + For instance, if we only seek to compare the differences between heterozygous and hemizygous variants, + but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups. + + See the :ref:`filtering-predicate` section for an example. + + The `predicate` is checked for being able to produce all items in `targets` + and the `targets` must include at least 2 categorizations. + + :param predicate: the base predicate whose categorizations are subject to filtering. + :param targets: the categorizations to retain + """ + return FilteringGenotypePolyPredicate.create( + predicate=predicate, + targets=targets, + ) + + +class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate): # NOT PART OF THE PUBLIC API # TODO: this predicate is a bit weird and I think it should eventually go away. 
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 97d4fee06..67bc2d117 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -6,6 +6,7 @@ from gpsea.analysis.predicate.genotype import ( GenotypePolyPredicate, groups_predicate, + filtering_predicate, VariantPredicates, VariantPredicate, ModeOfInheritancePredicate, @@ -209,7 +210,7 @@ def test_filtering_predicate( ): cats = x_recessive_gt_predicate.get_categorizations() targets = [cats[i] for i in indices] - predicate = GenotypePolyPredicate.filtering_predicate( + predicate = filtering_predicate( predicate=x_recessive_gt_predicate, targets=targets, ) @@ -223,7 +224,7 @@ def test_filtering_predicate__explodes_when_not_subsetting( x_recessive_gt_predicate: GenotypePolyPredicate, ): with pytest.raises(ValueError) as ve: - GenotypePolyPredicate.filtering_predicate( + filtering_predicate( predicate=x_recessive_gt_predicate, targets=x_recessive_gt_predicate.get_categorizations(), ) @@ -238,7 +239,7 @@ def test_filtering_predicate__explodes_when_using_random_junk( x_recessive_gt_predicate: GenotypePolyPredicate, ): with pytest.raises(ValueError) as ve: - GenotypePolyPredicate.filtering_predicate( + filtering_predicate( predicate=x_recessive_gt_predicate, targets=(0, 1), ) @@ -253,7 +254,7 @@ def test_filtering_predicate__explodes_when_using_one_category( x_recessive_gt_predicate: GenotypePolyPredicate, ): with pytest.raises(ValueError) as ve: - GenotypePolyPredicate.filtering_predicate( + filtering_predicate( predicate=x_recessive_gt_predicate, targets=(x_recessive_gt_predicate.get_categorizations()[0],), ) diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py index 2f78d7185..a076adacf 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ 
b/tests/analysis/predicate/genotype/test_predicates.py @@ -202,6 +202,7 @@ def test_protein_feature_id( assert predicate.test(missense_variant) == expected + class TestLogicalVariantPredicate: """ Test that the AND and OR variant predicate combinators work as expected.