Skip to content

Commit

Permalink
Improve docs for filtering_predicate.
Browse files Browse the repository at this point in the history
  • Loading branch information
ielis committed Sep 3, 2024
1 parent 21caddc commit 8ba618e
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 86 deletions.
58 changes: 58 additions & 0 deletions docs/user-guide/predicates.rst
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,64 @@ for assigning a patient into a genotype group:
The `gt_predicate` can be used in downstream analysis, such as in :class:


.. _filtering-predicate:

Filtering predicate
===================

Sometimes a predicate can bin individuals into more genotype groups than necessary, and there may be a need
to consider only a subset of the groups. A `GenotypePolyPredicate`
created by :func:`~gpsea.analysis.predicate.genotype.filtering_predicate` retains only
the target categorizations of interest.

Example
-------

Let's suppose we want to test the genotype-phenotype association between variants
that lead to a frameshift or a stop gain in a fictional transcript `NM_1234.5`,
and we are specifically interested in comparing the heterozygous genotype
with the biallelic alternative allele genotypes (homozygous alternate and compound heterozygous).

First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame:

>>> from gpsea.model import VariantEffect
>>> from gpsea.analysis.predicate.genotype import VariantPredicates
>>> tx_id = 'NM_1234.5'
>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \
... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id)
>>> is_frameshift_or_stop_gain.get_question()
'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)'

Then, we use :meth:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
to create a predicate that bins the patients into genotype groups:

>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
>>> gt_predicate.display_question()
'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'

We see that the `gt_predicate` bins the patients into three groups:

>>> cats = gt_predicate.get_categorizations()
>>> cats
(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT))

We pass the `gt_predicate` along with the categorizations of interest to the `filtering_predicate` function,
and we get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
that includes only the categories of interest:

>>> from gpsea.analysis.predicate.genotype import filtering_predicate
>>> fgt_predicate = filtering_predicate(
... predicate=gt_predicate,
... targets=(cats[1], cats[2]),
... )
>>> fgt_predicate.display_question()
'Which genotype group does the patient fit in: HET, BIALLELIC_ALT'


.. _groups-predicate:

Groups predicate
================

Expand Down
4 changes: 2 additions & 2 deletions src/gpsea/analysis/predicate/genotype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from ._api import GenotypePolyPredicate
from ._api import VariantPredicate
from ._counter import AlleleCounter
from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate
from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate
from ._gt_predicates import ModeOfInheritancePredicate
from ._variant import VariantPredicates, ProteinPredicates

__all__ = [
'GenotypePolyPredicate',
'boolean_predicate', 'groups_predicate', 'recessive_predicate',
'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate',
'ModeOfInheritancePredicate',
'AlleleCounter', 'VariantPredicate',
'VariantPredicates', 'ProteinPredicates',
Expand Down
82 changes: 2 additions & 80 deletions src/gpsea/analysis/predicate/genotype/_api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import abc
import typing

from gpsea.model import Patient, Variant
from gpsea.model import Variant
from .._api import PolyPredicate, Categorization, PatientCategory


Expand All @@ -10,85 +10,7 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta
`GenotypePolyPredicate` is a base class for all :class:`PolyPredicate`
that test the genotype axis.
"""

@staticmethod
def filtering_predicate(
    predicate: "GenotypePolyPredicate",
    targets: typing.Collection[Categorization],
) -> "GenotypePolyPredicate":
    """
    Create a predicate that applies `predicate` but reports only the categorizations in `targets`.

    The work is delegated to `FilteringGenotypePolyPredicate.create`.

    :param predicate: the base predicate whose categorizations are being subset.
    :param targets: the categorizations to retain.
    :returns: the predicate produced by `FilteringGenotypePolyPredicate.create`.
    """
    return FilteringGenotypePolyPredicate.create(predicate, targets)


class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
    """
    Genotype predicate that delegates to a wrapped predicate
    and reports only the allowed categorizations.

    A patient whose categorization from the wrapped predicate is not among
    the allowed ones is assigned `None`.
    """
    # NOT PART OF THE PUBLIC API

    @staticmethod
    def create(
        predicate: "GenotypePolyPredicate",
        targets: typing.Collection[Categorization],
    ) -> "FilteringGenotypePolyPredicate":
        """
        Validate `targets` against `predicate` and build the filtering predicate.

        :param predicate: the predicate whose categorizations are being subset.
        :param targets: the categorizations to retain.
        :raises ValueError: if fewer than 2 targets are provided, if any target
          is not a `Categorization`, if any target is not among the categorizations
          of `predicate`, or if the number of targets equals the number
          of categorizations of `predicate`.
        """
        # NOTE(review): the checks below count elements, so a target repeated
        # in `targets` is not detected here - confirm callers never pass duplicates.
        n_targets = len(targets)
        if n_targets < 2:
            raise ValueError(f'At least 2 target categorizations must be provided but got {n_targets}')

        # Report the positions (not the values) of the items that are not categorizations.
        bad_indices = [
            str(idx)
            for idx, target in enumerate(targets)
            if not isinstance(target, Categorization)
        ]
        if bad_indices:
            offenders = ', '.join(bad_indices)
            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')

        # Every target must be one of the categorizations of the base predicate.
        absent = [
            target
            for target in targets
            if target not in predicate.get_categorizations()
        ]
        if absent:
            missing = ', '.join(target.category.name for target in absent)
            raise ValueError(f'Some from the categories are not present: {missing}')

        # Retaining as many categorizations as the base predicate has is not a subset.
        if n_targets == predicate.n_categorizations():
            raise ValueError(
                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
                f'with the same number ({n_targets}) of targets'
            )

        return FilteringGenotypePolyPredicate(predicate, targets)

    def __init__(
        self,
        predicate: "GenotypePolyPredicate",
        allowed: typing.Iterable[Categorization],
    ):
        """
        Store the wrapped predicate and the allowed categorizations.

        No validation happens here; `create` performs the checks.
        """
        self._predicate = predicate
        # Materialize into a tuple so the iterable can be consulted repeatedly.
        self._allowed = tuple(allowed)

    def get_categorizations(self) -> typing.Sequence[Categorization]:
        """Return the allowed categorizations."""
        return self._allowed

    def get_question_base(self) -> str:
        """Return the question base of the wrapped predicate."""
        return self._predicate.get_question_base()

    def test(self, patient: Patient) -> typing.Optional[Categorization]:
        """
        Categorize `patient` with the wrapped predicate.

        :returns: the categorization if it is one of the allowed ones, `None` otherwise.
        """
        outcome = self._predicate.test(patient)
        return outcome if outcome in self._allowed else None

    def __repr__(self):
        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
pass


class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
Expand Down
95 changes: 95 additions & 0 deletions src/gpsea/analysis/predicate/genotype/_gt_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ def groups_predicate(
The genotype groups *should* not overlap.
In case of an overlap, the patient will be assigned into no group (`None`).
See the :ref:`groups-predicate` section for an example.
:param predicates: an iterable with at least 2 variant predicates to determine a genotype group.
:param group_names: an iterable with group names. The number of group names must match the number of predicates.
"""
Expand Down Expand Up @@ -182,6 +184,99 @@ def groups_predicate(
)


class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
    """
    Genotype predicate that delegates to a wrapped predicate
    and reports only the allowed categorizations.

    A patient whose categorization from the wrapped predicate is not among
    the allowed ones is assigned `None`.
    """
    # NOT PART OF THE PUBLIC API

    @staticmethod
    def create(
        predicate: "GenotypePolyPredicate",
        targets: typing.Collection[Categorization],
    ) -> "FilteringGenotypePolyPredicate":
        """
        Validate `targets` against `predicate` and build the filtering predicate.

        :param predicate: the predicate whose categorizations are being subset.
        :param targets: the categorizations to retain.
        :raises ValueError: if fewer than 2 targets are provided, if any target
          is not a `Categorization`, if any target is not among the categorizations
          of `predicate`, or if the number of targets equals the number
          of categorizations of `predicate`.
        """
        # NOTE(review): the checks below count elements, so a target repeated
        # in `targets` is not detected here - confirm callers never pass duplicates.
        n_targets = len(targets)
        if n_targets < 2:
            raise ValueError(f'At least 2 target categorizations must be provided but got {n_targets}')

        # Report the positions (not the values) of the items that are not categorizations.
        bad_indices = [
            str(idx)
            for idx, target in enumerate(targets)
            if not isinstance(target, Categorization)
        ]
        if bad_indices:
            offenders = ', '.join(bad_indices)
            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')

        # Every target must be one of the categorizations of the base predicate.
        absent = [
            target
            for target in targets
            if target not in predicate.get_categorizations()
        ]
        if absent:
            missing = ', '.join(target.category.name for target in absent)
            raise ValueError(f'Some from the categories are not present: {missing}')

        # Retaining as many categorizations as the base predicate has is not a subset.
        if n_targets == predicate.n_categorizations():
            raise ValueError(
                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
                f'with the same number ({n_targets}) of targets'
            )

        return FilteringGenotypePolyPredicate(predicate, targets)

    def __init__(
        self,
        predicate: "GenotypePolyPredicate",
        allowed: typing.Iterable[Categorization],
    ):
        """
        Store the wrapped predicate and the allowed categorizations.

        No validation happens here; `create` performs the checks.
        """
        self._predicate = predicate
        # Materialize into a tuple so the iterable can be consulted repeatedly.
        self._allowed = tuple(allowed)

    def get_categorizations(self) -> typing.Sequence[Categorization]:
        """Return the allowed categorizations."""
        return self._allowed

    def get_question_base(self) -> str:
        """Return the question base of the wrapped predicate."""
        return self._predicate.get_question_base()

    def test(self, patient: Patient) -> typing.Optional[Categorization]:
        """
        Categorize `patient` with the wrapped predicate.

        :returns: the categorization if it is one of the allowed ones, `None` otherwise.
        """
        outcome = self._predicate.test(patient)
        return outcome if outcome in self._allowed else None

    def __repr__(self):
        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"


def filtering_predicate(
    predicate: GenotypePolyPredicate,
    targets: typing.Collection[Categorization],
) -> GenotypePolyPredicate:
    """
    Create a predicate that applies the base `predicate`
    but only returns the categorizations from the provided `targets` collection.

    This can be useful if only some of the categorizations are interesting.
    For instance, if we only seek to compare the differences between heterozygous and hemizygous variants,
    but the predicate also bins the patients into homozygous reference and biallelic alt genotype groups.
    A patient who is assigned a categorization outside of `targets` gets `None`.

    See the :ref:`filtering-predicate` section for an example.

    The `predicate` is checked for being able to produce all items in `targets`,
    the `targets` must include at least 2 categorizations,
    and the `targets` must not include as many items as the `predicate` has categorizations.

    :param predicate: the base predicate whose categorizations are subject to filtering.
    :param targets: the categorizations to retain.
    :raises ValueError: if `targets` do not pass the checks described above
      or if an item of `targets` is not a categorization.
    """
    return FilteringGenotypePolyPredicate.create(predicate, targets)


class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate):
# NOT PART OF THE PUBLIC API
# TODO: this predicate is a bit weird and I think it should eventually go away.
Expand Down
9 changes: 5 additions & 4 deletions tests/analysis/predicate/genotype/test_gt_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from gpsea.analysis.predicate.genotype import (
GenotypePolyPredicate,
groups_predicate,
filtering_predicate,
VariantPredicates,
VariantPredicate,
ModeOfInheritancePredicate,
Expand Down Expand Up @@ -209,7 +210,7 @@ def test_filtering_predicate(
):
cats = x_recessive_gt_predicate.get_categorizations()
targets = [cats[i] for i in indices]
predicate = GenotypePolyPredicate.filtering_predicate(
predicate = filtering_predicate(
predicate=x_recessive_gt_predicate,
targets=targets,
)
Expand All @@ -223,7 +224,7 @@ def test_filtering_predicate__explodes_when_not_subsetting(
x_recessive_gt_predicate: GenotypePolyPredicate,
):
with pytest.raises(ValueError) as ve:
GenotypePolyPredicate.filtering_predicate(
filtering_predicate(
predicate=x_recessive_gt_predicate,
targets=x_recessive_gt_predicate.get_categorizations(),
)
Expand All @@ -238,7 +239,7 @@ def test_filtering_predicate__explodes_when_using_random_junk(
x_recessive_gt_predicate: GenotypePolyPredicate,
):
with pytest.raises(ValueError) as ve:
GenotypePolyPredicate.filtering_predicate(
filtering_predicate(
predicate=x_recessive_gt_predicate,
targets=(0, 1),
)
Expand All @@ -253,7 +254,7 @@ def test_filtering_predicate__explodes_when_using_one_category(
x_recessive_gt_predicate: GenotypePolyPredicate,
):
with pytest.raises(ValueError) as ve:
GenotypePolyPredicate.filtering_predicate(
filtering_predicate(
predicate=x_recessive_gt_predicate,
targets=(x_recessive_gt_predicate.get_categorizations()[0],),
)
Expand Down
1 change: 1 addition & 0 deletions tests/analysis/predicate/genotype/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def test_protein_feature_id(

assert predicate.test(missense_variant) == expected


class TestLogicalVariantPredicate:
"""
Test that the AND and OR variant predicate combinators work as expected.
Expand Down

0 comments on commit 8ba618e

Please sign in to comment.