From 8ba618e3f71f599fd1629d19b104c307504c1fcd Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 20:46:24 +0200
Subject: [PATCH] Improve docs for `filtering_predicate`.

---
 docs/user-guide/predicates.rst                | 58 +++++++++++
 .../analysis/predicate/genotype/__init__.py   |  4 +-
 src/gpsea/analysis/predicate/genotype/_api.py | 82 +---------------
 .../predicate/genotype/_gt_predicates.py      | 95 +++++++++++++++++++
 .../predicate/genotype/test_gt_predicates.py  |  9 +-
 .../predicate/genotype/test_predicates.py     |  1 +
 6 files changed, 163 insertions(+), 86 deletions(-)

diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst
index 6d95dfc5e..b224516f5 100644
--- a/docs/user-guide/predicates.rst
+++ b/docs/user-guide/predicates.rst
@@ -306,6 +306,64 @@ for assigning a patient into a genotype group:
 The `gt_predicate` can be used in downstream analysis, such as in :class:
 
 
+.. _filtering-predicate:
+
+Filtering predicate
+===================
+
+Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be a need
+to consider only a subset of the groups. A `GenotypePolyPredicate`
+created by :func:`~gpsea.analysis.predicate.genotype.filtering_predicate` retains only
+the target categorizations of interest.
+
+Example
+-------
+
+Let's suppose we want to test the genotype-phenotype association for variants
+that lead to a frameshift or a stop gain in a fictional transcript `NM_1234.5`,
+and we are specifically interested in comparing the heterozygous genotype
+with the biallelic alternative allele genotypes (homozygous alternate and compound heterozygous).
+
+First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame:
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates
+>>> tx_id = 'NM_1234.5'
+>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \
+...     | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id)
+>>> is_frameshift_or_stop_gain.get_question()
+'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)'
+
+Then, we use :meth:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+to create a predicate that bins the patients into genotype groups:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
+>>> gt_predicate.display_question()
+'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'
+
+We see that the `gt_predicate` bins the patients into three groups:
+
+>>> cats = gt_predicate.get_categorizations()
+>>> cats
+(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT))
+
+We pass the `gt_predicate` along with the categorizations of interest to the `filtering_predicate` function,
+and we get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+that includes only the categories of interest:
+
+>>> from gpsea.analysis.predicate.genotype import filtering_predicate
+>>> fgt_predicate = filtering_predicate(
+...     predicate=gt_predicate,
+...     targets=(cats[1], cats[2]),
+... )
+>>> fgt_predicate.display_question()
+'Which genotype group does the patient fit in: HET, BIALLELIC_ALT'
+
+
+.. _groups-predicate:
+
 Groups predicate
 ================
 
diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py
index ff8d16e1b..86dd261a2 100644
--- a/src/gpsea/analysis/predicate/genotype/__init__.py
+++ b/src/gpsea/analysis/predicate/genotype/__init__.py
@@ -1,13 +1,13 @@
 from ._api import GenotypePolyPredicate
 from ._api import VariantPredicate
 from ._counter import AlleleCounter
-from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate
+from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate
 from ._gt_predicates import ModeOfInheritancePredicate
 from ._variant import VariantPredicates, ProteinPredicates
 
 __all__ = [
     'GenotypePolyPredicate',
-    'boolean_predicate', 'groups_predicate', 'recessive_predicate',
+    'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate',
     'ModeOfInheritancePredicate',
     'AlleleCounter', 'VariantPredicate',
     'VariantPredicates', 'ProteinPredicates',
diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py
index db29c573d..39cf020be 100644
--- a/src/gpsea/analysis/predicate/genotype/_api.py
+++ b/src/gpsea/analysis/predicate/genotype/_api.py
@@ -1,7 +1,7 @@
 import abc
 import typing
 
-from gpsea.model import Patient, Variant
+from gpsea.model import Variant
 from .._api import PolyPredicate, Categorization, PatientCategory
 
 
@@ -10,85 +10,7 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta
     `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate`
     that test the genotype axis.
     """
-
-    @staticmethod
-    def filtering_predicate(
-        predicate: "GenotypePolyPredicate",
-        targets: typing.Collection[Categorization],
-    ) -> "GenotypePolyPredicate":
-        """
-        """
-        return FilteringGenotypePolyPredicate.create(
-            predicate=predicate,
-            targets=targets,
-        )
-
-
-class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
-    # NOT PART OF THE PUBLIC API
-
-    @staticmethod
-    def create(
-        predicate: "GenotypePolyPredicate",
-        targets: typing.Collection[Categorization],
-    ) -> "FilteringGenotypePolyPredicate":
-        # At least 2 target categorizations must be provided
-        if len(targets) <= 1:
-            raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}')
-
-        good_boys = tuple(isinstance(cat, Categorization) for cat in targets)
-        if not all(good_boys):
-            offenders = ', '.join(
-                str(i)
-                for i, is_instance
-                in enumerate(good_boys) if not is_instance
-            )
-            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')
-
-        # All `allowed` categorizations must in fact be present in the `base` predicate.
-        cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets)
-        if not all(cats_are_in_fact_present):
-            missing = ', '.join(
-                c.category.name
-                for c, is_present
-                in zip(targets, cats_are_in_fact_present) if not is_present
-            )
-            raise ValueError(f'Some from the categories are not present: {missing}')
-        
-        if len(targets) == predicate.n_categorizations():
-            raise ValueError(
-                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
-                f'with the same number ({len(targets)}) of targets'
-            )
-
-        return FilteringGenotypePolyPredicate(
-            predicate=predicate,
-            allowed=targets,
-        )
-
-    def __init__(
-        self,
-        predicate: "GenotypePolyPredicate",
-        allowed: typing.Iterable[Categorization],
-    ):
-        self._predicate = predicate
-        self._allowed = tuple(allowed)
-    
-    def get_categorizations(self) -> typing.Sequence[Categorization]:
-        return self._allowed
-
-    def get_question_base(self) -> str:
-        return self._predicate.get_question_base()
-
-    def test(self, patient: Patient) -> typing.Optional[Categorization]:
-        cat = self._predicate.test(patient)
-        if cat in self._allowed:
-            return cat
-        else:
-            return None
-
-    def __repr__(self):
-        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
+    pass
 
 
 class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 4007e8d1b..43a39594a 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -151,6 +151,8 @@ def groups_predicate(
     The genotype groups *should* not overlap.
     In case of an overlap, the patient will be assigned into no group (`None`).
 
+    See the :ref:`groups-predicate` section for an example.
+
     :param predicates: an iterable with at least 2 variant predicates to determine a genotype group.
     :param group_names: an iterable with group names. The number of group names must match the number of predicates.
     """
@@ -182,6 +184,99 @@ def groups_predicate(
     )
 
 
+class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
+    # NOT PART OF THE PUBLIC API
+
+    @staticmethod
+    def create(
+        predicate: "GenotypePolyPredicate",
+        targets: typing.Collection[Categorization],
+    ) -> "FilteringGenotypePolyPredicate":
+        # At least 2 target categorizations must be provided
+        if len(targets) <= 1:
+            raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}')
+
+        good_boys = tuple(isinstance(cat, Categorization) for cat in targets)
+        if not all(good_boys):
+            offenders = ', '.join(
+                str(i)
+                for i, is_instance
+                in enumerate(good_boys) if not is_instance
+            )
+            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')
+
+        # All `targets` must in fact be present among the categorizations of the base `predicate`.
+        cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets)
+        if not all(cats_are_in_fact_present):
+            missing = ', '.join(
+                c.category.name
+                for c, is_present
+                in zip(targets, cats_are_in_fact_present) if not is_present
+            )
+            raise ValueError(f'Some from the categories are not present: {missing}')
+        
+        if len(targets) == predicate.n_categorizations():
+            raise ValueError(
+                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
+                f'with the same number ({len(targets)}) of targets'
+            )
+
+        return FilteringGenotypePolyPredicate(
+            predicate=predicate,
+            allowed=targets,
+        )
+
+    def __init__(
+        self,
+        predicate: "GenotypePolyPredicate",
+        allowed: typing.Iterable[Categorization],
+    ):
+        self._predicate = predicate
+        self._allowed = tuple(allowed)
+    
+    def get_categorizations(self) -> typing.Sequence[Categorization]:
+        return self._allowed
+
+    def get_question_base(self) -> str:
+        return self._predicate.get_question_base()
+
+    def test(self, patient: Patient) -> typing.Optional[Categorization]:
+        cat = self._predicate.test(patient)
+        if cat in self._allowed:
+            return cat
+        else:
+            return None
+
+    def __repr__(self):
+        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
+
+ 
+def filtering_predicate(
+    predicate: GenotypePolyPredicate,
+    targets: typing.Collection[Categorization],
+) -> GenotypePolyPredicate:
+    """
+    Filtering predicate applies the base `predicate` but only returns the categorizations
+    from the provided `targets` collection.
+
+    This can be useful if only some of the categorizations are interesting.
+    For instance, we may only seek to compare the differences between heterozygous and hemizygous variants,
+    while the predicate also bins the patients into homozygous reference and biallelic alt genotype groups.
+
+    See the :ref:`filtering-predicate` section for an example.
+
+    The `predicate` is checked for being able to produce all items in `targets`,
+    and the `targets` must include at least 2 categorizations but not as many as the `predicate` has.
+
+    :param predicate: the base predicate whose categorizations are subject to filtering.
+    :param targets: the categorizations to retain.
+    """
+    return FilteringGenotypePolyPredicate.create(
+        predicate=predicate,
+        targets=targets,
+    )
+
+
 class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate):
     # NOT PART OF THE PUBLIC API
     # TODO: this predicate is a bit weird and I think it should eventually go away.
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index 97d4fee06..67bc2d117 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -6,6 +6,7 @@
 from gpsea.analysis.predicate.genotype import (
     GenotypePolyPredicate,
     groups_predicate,
+    filtering_predicate,
     VariantPredicates,
     VariantPredicate,
     ModeOfInheritancePredicate,
@@ -209,7 +210,7 @@ def test_filtering_predicate(
     ):
         cats = x_recessive_gt_predicate.get_categorizations()
         targets = [cats[i] for i in indices]
-        predicate = GenotypePolyPredicate.filtering_predicate(
+        predicate = filtering_predicate(
             predicate=x_recessive_gt_predicate,
             targets=targets,
         )
@@ -223,7 +224,7 @@ def test_filtering_predicate__explodes_when_not_subsetting(
         x_recessive_gt_predicate: GenotypePolyPredicate,
     ):
         with pytest.raises(ValueError) as ve:
-            GenotypePolyPredicate.filtering_predicate(
+            filtering_predicate(
                 predicate=x_recessive_gt_predicate,
                 targets=x_recessive_gt_predicate.get_categorizations(),
             )
@@ -238,7 +239,7 @@ def test_filtering_predicate__explodes_when_using_random_junk(
         x_recessive_gt_predicate: GenotypePolyPredicate,
     ):
         with pytest.raises(ValueError) as ve:
-            GenotypePolyPredicate.filtering_predicate(
+            filtering_predicate(
                 predicate=x_recessive_gt_predicate,
                 targets=(0, 1),
             )
@@ -253,7 +254,7 @@ def test_filtering_predicate__explodes_when_using_one_category(
         x_recessive_gt_predicate: GenotypePolyPredicate,
     ):
         with pytest.raises(ValueError) as ve:
-            GenotypePolyPredicate.filtering_predicate(
+            filtering_predicate(
                 predicate=x_recessive_gt_predicate,
                 targets=(x_recessive_gt_predicate.get_categorizations()[0],),
             )
diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py
index 2f78d7185..a076adacf 100644
--- a/tests/analysis/predicate/genotype/test_predicates.py
+++ b/tests/analysis/predicate/genotype/test_predicates.py
@@ -202,6 +202,7 @@ def test_protein_feature_id(
 
         assert predicate.test(missense_variant) == expected
 
+
 class TestLogicalVariantPredicate:
     """
     Test that the AND and OR variant predicate combinators work as expected.