From 3f38f152fce9322aa5c86b65db17d490d6468c91 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 27 Aug 2024 21:03:52 +0200
Subject: [PATCH 01/16] Configure the protein metadata service in the
 `preprocessing` module.

---
 src/gpsea/analysis/_config.py       | 53 ++++++++-----------------
 src/gpsea/preprocessing/__init__.py |  4 +-
 src/gpsea/preprocessing/_config.py  | 61 ++++++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 44 deletions(-)

diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py
index 3fb67fc77..dd723347c 100644
--- a/src/gpsea/analysis/_config.py
+++ b/src/gpsea/analysis/_config.py
@@ -1,13 +1,15 @@
+import enum
 import logging
 import os
 import typing
-import enum
+import warnings
+
 import hpotk
 
 from gpsea.config import get_cache_dir_path
 from gpsea.model import Cohort
-from gpsea.preprocessing import ProteinMetadataService, UniprotProteinMetadataService, ProteinAnnotationCache, \
-    ProtCachingMetadataService
+from gpsea.preprocessing import ProteinMetadataService
+from gpsea.preprocessing import configure_default_protein_metadata_service as backup_pms
 from .mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter
 from ._api import CohortAnalysis
 from ._gp_analysis import FisherExactAnalyzer
@@ -264,7 +266,11 @@ def configure_cohort_analysis(
     """
     if config is None:
         config = CohortAnalysisConfiguration()  # Use the default config
-    cache_dir = _configure_cache_dir(cache_dir)
+
+    cache_path = get_cache_dir_path(cache_dir)
+    os.makedirs(cache_path, exist_ok=True)
+
+    cache_dir = str(cache_path)
     protein_metadata_service = configure_default_protein_metadata_service(protein_source, cache_dir)
 
     mtc_filter: PhenotypeMtcFilter
@@ -338,36 +344,9 @@ def configure_default_protein_metadata_service(
     in current working directory under `.gpsea_cache/protein_cache`
     and reach out to UNIPROT REST API if a cache entry is missing.
     """
-    cache_dir = _configure_cache_dir(cache_dir)
-    return _configure_protein_service(protein_fallback=protein_source, cache_dir=cache_dir)
-
-
-def _configure_protein_service(
-    protein_fallback: str,
-    cache_dir: str,
-) -> ProteinMetadataService:
-    # (1) ProteinMetadataService
-    # Setup fallback
-    protein_fallback = _configure_fallback_protein_service(protein_fallback)
-    # Setup protein metadata cache
-    prot_cache_dir = os.path.join(cache_dir, 'protein_cache')
-    os.makedirs(prot_cache_dir, exist_ok=True)
-    prot_cache = ProteinAnnotationCache(prot_cache_dir)
-    # Assemble the final protein metadata service
-    protein_metadata_service = ProtCachingMetadataService(prot_cache, protein_fallback)
-    return protein_metadata_service
-
-
-def _configure_fallback_protein_service(protein_fallback: str) -> ProteinMetadataService:
-    if protein_fallback == 'UNIPROT':
-        fallback1 = UniprotProteinMetadataService()
-    else:
-        raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}')
-    return fallback1
-
-
-def _configure_cache_dir(cache_dir: typing.Optional[str]) -> str:
-    cache_path = get_cache_dir_path(cache_dir)
-    os.makedirs(cache_path, exist_ok=True)
-
-    return str(cache_path)
+    # TODO: remove at some point.
+    warnings.warn(
+        'Use gpsea.preprocessing.configure_default_protein_metadata_service` instead',
+        DeprecationWarning, stacklevel=2,
+    )
+    return backup_pms(protein_source=protein_source, cache_dir=cache_dir)
diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py
index b4367167b..1551b4d63 100644
--- a/src/gpsea/preprocessing/__init__.py
+++ b/src/gpsea/preprocessing/__init__.py
@@ -4,7 +4,7 @@
 from ._config import configure_caching_patient_creator, configure_patient_creator
 from ._config import load_phenopacket_folder, load_phenopackets
 from ._config import configure_caching_cohort_creator, configure_cohort_creator
-from ._config import configure_protein_metadata_service
+from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service
 from ._generic import DefaultImpreciseSvFunctionalAnnotator
 from ._patient import PatientCreator, CohortCreator
 from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator
@@ -18,7 +18,7 @@
 __all__ = [
     'configure_caching_patient_creator', 'configure_patient_creator',
     'configure_caching_cohort_creator', 'configure_cohort_creator',
-    'configure_protein_metadata_service',
+    'configure_default_protein_metadata_service', 'configure_protein_metadata_service',
     'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService',
     'PatientCreator', 'CohortCreator',
     'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopackets',
diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py
index 67c14eab3..348bd5b1c 100644
--- a/src/gpsea/preprocessing/_config.py
+++ b/src/gpsea/preprocessing/_config.py
@@ -225,14 +225,65 @@ def configure_protein_metadata_service(
      In any case, the directory will be created if it does not exist (including any non-existing parents).
     :param timeout: timeout in seconds for the REST APIs.
     """
+    warnings.warn('Use `configure_default_protein_metadata_service` instead', DeprecationWarning, stacklevel=2)
+    return configure_default_protein_metadata_service(
+        cache_dir=cache_dir,
+        timeout=timeout,
+    )
+
+
+def configure_default_protein_metadata_service(
+    protein_source: typing.Literal['UNIPROT'] = 'UNIPROT',
+    cache_dir: typing.Optional[str] = None,
+    timeout: float = 30.,
+) -> ProteinMetadataService:
+    """
+    Create default protein metadata service that will cache the protein metadata
+    in current working directory under `.gpsea_cache/protein_cache`
+    and reach out to UNIPROT REST API if a cache entry is missing.
+
+    :param protein_source: a `str` with the code of the protein data sources (currently accepting just `UNIPROT`).
+    :param cache_dir: path to the folder where we will cache the results fetched from the remote APIs or `None`
+        if the data should be cached as described by :func:`~gpsea.config.get_cache_dir_path` function.
+        In any case, the directory will be created if it does not exist (including any non-existing parents).
+    :param timeout: timeout in seconds for the REST APIs.
+    """
     cache_dir = _configure_cache_dir(cache_dir)
+    return _configure_protein_service(
+        protein_fallback=protein_source,
+        cache_dir=cache_dir,
+        timeout=timeout,
+    )
 
-    protein_cache_dir = os.path.join(cache_dir, "protein_cache")
-    os.makedirs(protein_cache_dir, exist_ok=True)
 
-    cache = ProteinAnnotationCache(protein_cache_dir)
-    fallback = UniprotProteinMetadataService(timeout=timeout)
-    return ProtCachingMetadataService(cache=cache, fallback=fallback)
+def _configure_protein_service(
+    protein_fallback: str,
+    cache_dir: str,
+    timeout: float,
+) -> ProteinMetadataService:
+    # (1) ProteinMetadataService
+    # Setup fallback
+    protein_fallback = _configure_fallback_protein_service(
+        protein_fallback,
+        timeout,
+    )
+    # Setup protein metadata cache
+    prot_cache_dir = os.path.join(cache_dir, 'protein_cache')
+    os.makedirs(prot_cache_dir, exist_ok=True)
+    prot_cache = ProteinAnnotationCache(prot_cache_dir)
+    # Assemble the final protein metadata service
+    protein_metadata_service = ProtCachingMetadataService(prot_cache, protein_fallback)
+    return protein_metadata_service
+
+
+def _configure_fallback_protein_service(
+    protein_fallback: str,
+    timeout: float,
+) -> ProteinMetadataService:
+    if protein_fallback == 'UNIPROT':
+        return UniprotProteinMetadataService(timeout)
+    else:
+        raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}')
 
 
 def _configure_cache_dir(

From d641c0835dc0e5f9c7af055179c1f510d876c648 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 28 Aug 2024 10:47:06 +0200
Subject: [PATCH 02/16] Add `KaryotypicSex`.

---
 src/gpsea/model/__init__.py | 10 +++---
 src/gpsea/model/_base.py    | 63 +++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/src/gpsea/model/__init__.py b/src/gpsea/model/__init__.py
index 2530b4987..d1b8e8925 100644
--- a/src/gpsea/model/__init__.py
+++ b/src/gpsea/model/__init__.py
@@ -5,19 +5,21 @@
 and we follow with data classes for phenotype, genotype, transcript, and protein info.
 """
 
-from ._base import SampleLabels
+from ._base import SampleLabels, KaryotypicSex
 from ._cohort import Cohort, Patient
 from ._gt import Genotype, Genotypes, Genotyped
 from ._phenotype import Phenotype, Disease
 from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
 from ._tx import TranscriptCoordinates
-from ._variant import VariantCoordinates, ImpreciseSvInfo, VariantInfo,  VariantInfoAware, TranscriptAnnotation, TranscriptInfoAware, VariantClass, Variant
+from ._variant import VariantCoordinates, ImpreciseSvInfo, VariantInfo, VariantInfoAware, VariantClass, Variant
+from ._variant import TranscriptAnnotation, TranscriptInfoAware
 from ._variant_effects import VariantEffect
 
 __all__ = [
-    'Cohort', 'Patient', 'SampleLabels',
+    'Cohort', 'Patient', 'SampleLabels', 'KaryotypicSex',
     'Phenotype', 'Disease',
-    'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware', 'Genotype', 'Genotypes', 'Genotyped',
+    'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware',
+    'Genotype', 'Genotypes', 'Genotyped',
     'TranscriptAnnotation', 'VariantEffect', 'TranscriptInfoAware', 'TranscriptCoordinates',
     'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
 ]
diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py
index 36716831e..31ca49c80 100644
--- a/src/gpsea/model/_base.py
+++ b/src/gpsea/model/_base.py
@@ -1,8 +1,71 @@
+import enum
 import typing
 
 import hpotk
 
 
+class KaryotypicSex(enum.Enum):
+    """
+    `KaryotypicSex`, as included in
+    `Phenopacket Schema <https://phenopacket-schema.readthedocs.io/en/2.0.0/karyotypicsex.html>`_
+    """
+    
+    UNKNOWN_KARYOTYPE = 0
+    """
+    Untyped or inconclusive karyotyping.
+    """
+    
+    XX = 1
+    """
+    Female.
+    """
+
+    XY = 2
+    """
+    Male.
+    """
+    
+    XO = 3
+    """
+    Single X chromosome.
+    """
+    
+    XXY = 4
+    """
+    Two X and one Y chromosome.
+    """
+    
+    XXX = 5
+    """
+    Three X chromosomes
+    """
+
+    XXYY = 6
+    """
+    Two X chromosomes and two Y chromosomes.
+    """
+
+    XXXY = 7
+    """
+    Three X chromosomes and one Y chromosome.
+    """
+    
+    XXXX = 8
+    """
+    Four X chromosomes.
+    """
+
+    XYY = 9
+    """
+    One X and two Y chromosomes.
+    """
+
+    OTHER_KARYOTYPE = 10
+    """
+    None of the above types.
+    """
+
+
 class SampleLabels:
     """
     A data model for subject identifiers.

From 9613f9416bc4fde27c5898e4e0593798540ac578 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 28 Aug 2024 21:18:30 +0200
Subject: [PATCH 03/16] Add `Sex` to `Patient`.

---
 src/gpsea/analysis/_config.py                 |   2 +-
 src/gpsea/data/_toy.py                        | 462 +++++++++++-------
 src/gpsea/io.py                               |   8 +-
 src/gpsea/model/__init__.py                   |   4 +-
 src/gpsea/model/_base.py                      |  57 +--
 src/gpsea/model/_cohort.py                    |  48 +-
 src/gpsea/preprocessing/_audit.py             |   2 +-
 src/gpsea/preprocessing/_phenopacket.py       | 220 ++++++---
 tests/analysis/conftest.py                    |   6 +-
 tests/analysis/predicate/genotype/conftest.py |   6 +-
 .../predicate/genotype/test_allele_counter.py |   3 +-
 tests/analysis/test_pscore.py                 |   5 +-
 tests/conftest.py                             |  83 ++--
 tests/test_data/SUOX.json                     |  35 ++
 tests/test_io.py                              |   1 -
 15 files changed, 593 insertions(+), 349 deletions(-)

diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py
index dd723347c..2c6d68c63 100644
--- a/src/gpsea/analysis/_config.py
+++ b/src/gpsea/analysis/_config.py
@@ -271,7 +271,7 @@ def configure_cohort_analysis(
     os.makedirs(cache_path, exist_ok=True)
 
     cache_dir = str(cache_path)
-    protein_metadata_service = configure_default_protein_metadata_service(protein_source, cache_dir)
+    protein_metadata_service = backup_pms(protein_source, cache_dir)
 
     mtc_filter: PhenotypeMtcFilter
     if config.mtc_strategy == MtcStrategy.HPO_MTC:
diff --git a/src/gpsea/data/_toy.py b/src/gpsea/data/_toy.py
index e9b80234e..24a3b9c4b 100644
--- a/src/gpsea/data/_toy.py
+++ b/src/gpsea/data/_toy.py
@@ -3,7 +3,7 @@
 from gpsea.model import *
 from gpsea.model.genome import Contig, GenomicRegion, Strand
 
-CONTIG = Contig('1', 'GB_ACC', 'REFSEQ_NAME', 'UCSC_NAME', 1_000)
+CONTIG = Contig("1", "GB_ACC", "REFSEQ_NAME", "UCSC_NAME", 1_000)
 
 
 def make_region(start: int, end: int) -> GenomicRegion:
@@ -22,189 +22,293 @@ def get_toy_cohort() -> Cohort:
     # - Spasticity HP:0001257
     # - Chronic pancreatitis HP:0006280
 
-    arachnodactyly_T = Phenotype(TermId.from_curie('HP:0001166'), True)
-    focal_clonic_seizure_T = Phenotype(TermId.from_curie('HP:0002266'), True)
-    seizure_T = Phenotype(TermId.from_curie('HP:0001250'), True)
-    spasticity_T = Phenotype(TermId.from_curie('HP:0001257'), True)
-    arachnodactyly_F = Phenotype(TermId.from_curie('HP:0001166'), False)
-    focal_clonic_seizure_F = Phenotype(TermId.from_curie('HP:0002266'), False)
-    seizure_F = Phenotype(TermId.from_curie('HP:0001250'), False)
-    spasticity_F = Phenotype(TermId.from_curie('HP:0001257'), False)
-    
-    Disease_T = Disease(TermId.from_curie('OMIM:001234'), "Test present disease", True)
-    Disease_F = Disease(TermId.from_curie('OMIM:001234'), "Test absent disease", False)
+    arachnodactyly_T = Phenotype(TermId.from_curie("HP:0001166"), True)
+    focal_clonic_seizure_T = Phenotype(TermId.from_curie("HP:0002266"), True)
+    seizure_T = Phenotype(TermId.from_curie("HP:0001250"), True)
+    spasticity_T = Phenotype(TermId.from_curie("HP:0001257"), True)
+    arachnodactyly_F = Phenotype(TermId.from_curie("HP:0001166"), False)
+    focal_clonic_seizure_F = Phenotype(TermId.from_curie("HP:0002266"), False)
+    seizure_F = Phenotype(TermId.from_curie("HP:0001250"), False)
+    spasticity_F = Phenotype(TermId.from_curie("HP:0001257"), False)
 
-    snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene',
-                                                  'NM_1234.5', 'NM_1234.5:c.180A>G', False, [VariantEffect.MISSENSE_VARIANT], [1],
-                                                  'NP_09876.5', 'NP_09876.5:p.Unk200', 60, 60,
-                                                  Genotypes.from_mapping({
-                                                      SampleLabels('A'): Genotype.HETEROZYGOUS, SampleLabels('B'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('C'): Genotype.HOMOZYGOUS_ALTERNATE,
-                                                      SampleLabels('D'): Genotype.HETEROZYGOUS, SampleLabels('E'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('G'): Genotype.HETEROZYGOUS, SampleLabels('J'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('K'): Genotype.HETEROZYGOUS, SampleLabels('M'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('N'): Genotype.HOMOZYGOUS_ALTERNATE,
-                                                      SampleLabels('P'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('Q'): Genotype.HOMOZYGOUS_ALTERNATE,
-                                                      SampleLabels('R'): Genotype.HETEROZYGOUS, SampleLabels('T'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('V'): Genotype.HETEROZYGOUS, SampleLabels('Y'): Genotype.HETEROZYGOUS,
-                                                  })
-                                              )
-    deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2),
-                                                  'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del',
-                                                  False, [VariantEffect.FRAMESHIFT_VARIANT],
-                                                  [2], 'NP_09876.5','NP_09876.5:p.Unk200', 86, 87,
-                                                  Genotypes.from_mapping({
-                                                      SampleLabels('D'): Genotype.HETEROZYGOUS, SampleLabels('F'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('G'): Genotype.HETEROZYGOUS, SampleLabels('H'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('I'): Genotype.HOMOZYGOUS_ALTERNATE,
-                                                      SampleLabels('L'): Genotype.HETEROZYGOUS, SampleLabels('O'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('R'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('S'): Genotype.HOMOZYGOUS_ALTERNATE,
-                                                      SampleLabels('U'): Genotype.HETEROZYGOUS, SampleLabels('W'): Genotype.HETEROZYGOUS,
-                                                      SampleLabels('X'): Genotype.HETEROZYGOUS, SampleLabels('Z'): Genotype.HETEROZYGOUS,
-                                                  })
-                                                   )
+    Disease_T = Disease(TermId.from_curie("OMIM:001234"), "Test present disease", True)
+    Disease_F = Disease(TermId.from_curie("OMIM:001234"), "Test absent disease", False)
+
+    snv = Variant.create_variant_from_scratch(
+        VariantCoordinates(make_region(280, 281), "A", "G", 0),
+        "FakeGene",
+        "NM_1234.5",
+        "NM_1234.5:c.180A>G",
+        False,
+        [VariantEffect.MISSENSE_VARIANT],
+        [1],
+        "NP_09876.5",
+        "NP_09876.5:p.Unk200",
+        60,
+        60,
+        Genotypes.from_mapping(
+            {
+                SampleLabels("A"): Genotype.HETEROZYGOUS,
+                SampleLabels("B"): Genotype.HETEROZYGOUS,
+                SampleLabels("C"): Genotype.HOMOZYGOUS_ALTERNATE,
+                SampleLabels("D"): Genotype.HETEROZYGOUS,
+                SampleLabels("E"): Genotype.HETEROZYGOUS,
+                SampleLabels("G"): Genotype.HETEROZYGOUS,
+                SampleLabels("J"): Genotype.HETEROZYGOUS,
+                SampleLabels("K"): Genotype.HETEROZYGOUS,
+                SampleLabels("M"): Genotype.HETEROZYGOUS,
+                SampleLabels("N"): Genotype.HOMOZYGOUS_ALTERNATE,
+                SampleLabels("P"): Genotype.HETEROZYGOUS,
+                SampleLabels("Q"): Genotype.HOMOZYGOUS_ALTERNATE,
+                SampleLabels("R"): Genotype.HETEROZYGOUS,
+                SampleLabels("T"): Genotype.HETEROZYGOUS,
+                SampleLabels("V"): Genotype.HETEROZYGOUS,
+                SampleLabels("Y"): Genotype.HETEROZYGOUS,
+            }
+        ),
+    )
+    deletion = Variant.create_variant_from_scratch(
+        VariantCoordinates(make_region(360, 363), "TTC", "T", -2),
+        "FakeGene",
+        "NM_1234.5",
+        "NM_1234.5:c.261_263del",
+        False,
+        [VariantEffect.FRAMESHIFT_VARIANT],
+        [2],
+        "NP_09876.5",
+        "NP_09876.5:p.Unk200",
+        86,
+        87,
+        Genotypes.from_mapping(
+            {
+                SampleLabels("D"): Genotype.HETEROZYGOUS,
+                SampleLabels("F"): Genotype.HETEROZYGOUS,
+                SampleLabels("G"): Genotype.HETEROZYGOUS,
+                SampleLabels("H"): Genotype.HETEROZYGOUS,
+                SampleLabels("I"): Genotype.HOMOZYGOUS_ALTERNATE,
+                SampleLabels("L"): Genotype.HETEROZYGOUS,
+                SampleLabels("O"): Genotype.HETEROZYGOUS,
+                SampleLabels("R"): Genotype.HETEROZYGOUS,
+                SampleLabels("S"): Genotype.HOMOZYGOUS_ALTERNATE,
+                SampleLabels("U"): Genotype.HETEROZYGOUS,
+                SampleLabels("W"): Genotype.HETEROZYGOUS,
+                SampleLabels("X"): Genotype.HETEROZYGOUS,
+                SampleLabels("Z"): Genotype.HETEROZYGOUS,
+            }
+        ),
+    )
     het_dup = Variant.create_variant_from_scratch(
-        VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5',
-        'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], 'NP_09876.5', 'NP_09876.5:p.Unk200', 25, 25,
-        Genotypes.empty())  # Not used in the patients below, hence `empty()`.
+        VariantCoordinates(make_region(175, 176), "T", "TG", 1),
+        "FakeGene",
+        "NM_1234.5",
+        "NM_1234.5:c.75A>G",
+        False,
+        [VariantEffect.FRAMESHIFT_VARIANT],
+        [1],
+        "NP_09876.5",
+        "NP_09876.5:p.Unk200",
+        25,
+        25,
+        Genotypes.empty(),
+    )  # Not used in the patients below, hence `empty()`.
     hom_dup = Variant.create_variant_from_scratch(
-        VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5',
-        'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], 'NP_09876.5', 'NP_09876.5:p.Unk200', 25, 25,
-        Genotypes.empty())  # Not used in the patients below, hence `empty()`.
+        VariantCoordinates(make_region(175, 176), "T", "TG", 1),
+        "FakeGene",
+        "NM_1234.5",
+        "NM_1234.5:c.75A>G",
+        False,
+        [VariantEffect.FRAMESHIFT_VARIANT],
+        [1],
+        "NP_09876.5",
+        "NP_09876.5:p.Unk200",
+        25,
+        25,
+        Genotypes.empty(),
+    )  # Not used in the patients below, hence `empty()`.
 
     patients = (
-        Patient(SampleLabels('A'),
-                phenotypes=(arachnodactyly_T, spasticity_F, seizure_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('B'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('C'),
-                phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('D'),
-                phenotypes=(arachnodactyly_T, spasticity_T, seizure_T),
-                variants=[snv, deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('E'),
-                phenotypes=(arachnodactyly_T, spasticity_T, seizure_F),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('F'),
-                phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('G'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
-                variants=[snv, deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('H'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('I'),
-                phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('J'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('K'),
-                phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('L'),
-                phenotypes=(arachnodactyly_F, seizure_F, spasticity_F),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('M'),
-                phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('N'),
-                phenotypes=(arachnodactyly_F, seizure_T, spasticity_F),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('O'),
-                phenotypes=(arachnodactyly_F, seizure_F, spasticity_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('P'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('Q'),
-                phenotypes=(arachnodactyly_T, seizure_F, spasticity_F),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('R'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
-                variants=[snv, deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('S'),
-                phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('T'),
-                phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('U'),
-                phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('V'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('W'),
-                phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('X'),
-                phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('Y'),
-                phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
-                variants=[snv],
-                diseases=[Disease_T]
-                ),
-        Patient(SampleLabels('Z'),
-                phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
-                variants=[deletion],
-                diseases=[Disease_T]
-                ),
+        Patient.from_raw_parts(
+            SampleLabels("A"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, spasticity_F, seizure_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("B"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("C"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("D"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, spasticity_T, seizure_T),
+            variants=[snv, deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("E"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, spasticity_T, seizure_F),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("F"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("G"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+            variants=[snv, deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("H"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("I"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("J"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("K"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("L"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_F, spasticity_F),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("M"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("N"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_T, spasticity_F),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("O"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_F, spasticity_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("P"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("Q"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_F, spasticity_F),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("R"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
+            variants=[snv, deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("S"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("T"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("U"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("V"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("W"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("X"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("Y"),
+            sex=None,
+            phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+            variants=[snv],
+            diseases=[Disease_T],
+        ),
+        Patient.from_raw_parts(
+            SampleLabels("Z"),
+            sex=None,
+            phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+            variants=[deletion],
+            diseases=[Disease_T],
+        ),
     )
 
     return Cohort.from_patients(patients)
diff --git a/src/gpsea/io.py b/src/gpsea/io.py
index a9b300784..c5b3aea18 100644
--- a/src/gpsea/io.py
+++ b/src/gpsea/io.py
@@ -23,7 +23,7 @@ def default(self, o):
                 'tx_annotations': o.tx_annotations,
                 'genotypes': o.genotypes,
             }
-        elif isinstance(o, VariantInfo): 
+        elif isinstance(o, VariantInfo):
             return {
                 'variant_coordinates': o.variant_coordinates,
                 'sv_info': o.sv_info,
@@ -87,7 +87,7 @@ def default(self, o):
                 'label': o.label,
                 'meta_label': o.meta_label,
             }
-        elif isinstance(o, (Genotype, VariantEffect, Strand, VariantClass)):
+        elif isinstance(o, (Sex, Genotype, VariantEffect, Strand, VariantClass)):
             # enums
             return o.name
         elif isinstance(o, Phenotype):
@@ -104,6 +104,7 @@ def default(self, o):
         elif isinstance(o, Patient):
             return {
                 'labels': o.labels,
+                'sex': o.sex,
                 'phenotypes': o.phenotypes,
                 'diseases': o.diseases,
                 'variants': o.variants,
@@ -161,7 +162,7 @@ def default(self, o):
 _FEATURE_INFO = ('name', 'region')
 _PHENOTYPE_FIELDS = ('term_id', 'is_present')
 _DISEASE_FIELDS = ('term_id', 'name', 'is_observed')
-_PATIENT_FIELDS = ('labels', 'phenotypes', 'diseases', 'variants')
+_PATIENT_FIELDS = ('labels', 'sex', 'phenotypes', 'diseases', 'variants')
 _COHORT_FIELDS = ('members', 'excluded_patient_count')
 
 
@@ -281,6 +282,7 @@ def object_hook(obj: typing.Dict[typing.Any, typing.Any]) -> typing.Any:
         elif GpseaJSONDecoder._has_all_fields(obj, _PATIENT_FIELDS):
             return Patient(
                 labels=obj['labels'],
+                sex=Sex[obj['sex']],
                 phenotypes=obj['phenotypes'],
                 diseases=obj['diseases'],
                 variants=obj['variants'],
diff --git a/src/gpsea/model/__init__.py b/src/gpsea/model/__init__.py
index d1b8e8925..1d17138c5 100644
--- a/src/gpsea/model/__init__.py
+++ b/src/gpsea/model/__init__.py
@@ -5,7 +5,7 @@
 and we follow with data classes for phenotype, genotype, transcript, and protein info.
 """
 
-from ._base import SampleLabels, KaryotypicSex
+from ._base import SampleLabels, Sex
 from ._cohort import Cohort, Patient
 from ._gt import Genotype, Genotypes, Genotyped
 from ._phenotype import Phenotype, Disease
@@ -16,7 +16,7 @@
 from ._variant_effects import VariantEffect
 
 __all__ = [
-    'Cohort', 'Patient', 'SampleLabels', 'KaryotypicSex',
+    'Cohort', 'Patient', 'SampleLabels', 'Sex',
     'Phenotype', 'Disease',
     'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware',
     'Genotype', 'Genotypes', 'Genotyped',
diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py
index 31ca49c80..0effeab06 100644
--- a/src/gpsea/model/_base.py
+++ b/src/gpsea/model/_base.py
@@ -4,65 +4,26 @@
 import hpotk
 
 
-class KaryotypicSex(enum.Enum):
-    """
-    `KaryotypicSex`, as included in
-    `Phenopacket Schema <https://phenopacket-schema.readthedocs.io/en/2.0.0/karyotypicsex.html>`_
-    """
-    
-    UNKNOWN_KARYOTYPE = 0
-    """
-    Untyped or inconclusive karyotyping.
-    """
-    
-    XX = 1
-    """
-    Female.
-    """
-
-    XY = 2
-    """
-    Male.
+class Sex(enum.Enum):
     """
+    `Sex` represents typical “phenotypic sex”, as would be determined by a midwife or physician at birth.
     
-    XO = 3
-    """
-    Single X chromosome.
+    The definition is aligned with `Phenopacket Schema <https://phenopacket-schema.readthedocs.io/en/2.0.0/sex.html>`_
     """
     
-    XXY = 4
+    UNKNOWN_SEX = 0
     """
-    Two X and one Y chromosome.
+    Not assessed or not available. Maps to `NCIT:C17998`.
     """
     
-    XXX = 5
-    """
-    Three X chromosomes
-    """
-
-    XXYY = 6
-    """
-    Two X chromosomes and two Y chromosomes.
-    """
-
-    XXXY = 7
-    """
-    Three X chromosomes and one Y chromosome.
-    """
-    
-    XXXX = 8
-    """
-    Four X chromosomes.
-    """
-
-    XYY = 9
+    FEMALE = 1
     """
-    One X and two Y chromosomes.
+    Female sex. Maps to `NCIT:C46113`.
     """
 
-    OTHER_KARYOTYPE = 10
+    MALE = 2
     """
-    None of the above types.
+    Male sex. Maps to `NCIT:C46112`.
     """
 
 
diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py
index 3209748e8..03fcf08e4 100644
--- a/src/gpsea/model/_cohort.py
+++ b/src/gpsea/model/_cohort.py
@@ -5,7 +5,7 @@
 
 import hpotk
 
-from ._base import SampleLabels
+from ._base import SampleLabels, Sex
 from ._phenotype import Phenotype, Disease
 from ._variant import Variant
 
@@ -13,16 +13,49 @@
 class Patient:
     """
     `Patient` represents a single investigated individual.
+
+    .. note::
+    
+        We strongly recommend using the :func:`from_raw_parts` static constructor
+        instead of `__init__`.
     """
 
+    @staticmethod
+    def from_raw_parts(
+        labels: SampleLabels,
+        sex: typing.Optional[Sex],
+        phenotypes: typing.Iterable[Phenotype],
+        diseases: typing.Iterable[Disease],
+        variants: typing.Iterable[Variant]
+    ) -> "Patient":
+        """
+        Create `Patient` from the primary data.
+        """
+        if sex is None:
+            sex = Sex.UNKNOWN_SEX
+        
+        return Patient(
+            labels=labels,
+            sex=sex,
+            phenotypes=phenotypes,
+            diseases=diseases,
+            variants=variants,
+        )
+
     def __init__(
         self,
         labels: SampleLabels,
+        sex: Sex,
         phenotypes: typing.Iterable[Phenotype],
         diseases: typing.Iterable[Disease],
         variants: typing.Iterable[Variant]
     ):
+        assert isinstance(labels, SampleLabels)
         self._labels = labels
+        
+        assert isinstance(sex, Sex)
+        self._sex = sex
+
         self._phenotypes = tuple(phenotypes)
         self._diseases = tuple(diseases)
         self._variants = tuple(variants)
@@ -41,6 +74,13 @@ def labels(self) -> SampleLabels:
         """
         return self._labels
 
+    @property
+    def sex(self) -> Sex:
+        """
+        Get the "phenotype sex" of the sample.
+        """
+        return self._sex
+
     @property
     def phenotypes(self) -> typing.Sequence[Phenotype]:
         """
@@ -89,6 +129,7 @@ def excluded_diseases(self) -> typing.Iterator[Disease]:
     def __str__(self) -> str:
         return (f"Patient("
                 f"labels:{self._labels}, "
+                f"sex:{self._sex}, "
                 f"variants:{self.variants}, "
                 f"phenotypes:{[pheno.identifier for pheno in self.phenotypes]}, "
                 f"diseases:{[dis.identifier for dis in self.diseases]}")
@@ -99,12 +140,13 @@ def __repr__(self) -> str:
     def __eq__(self, other) -> bool:
         return (isinstance(other, Patient)
                 and self._labels == other._labels
+                and self._sex == other._sex
                 and self._variants == other._variants
                 and self._phenotypes == other._phenotypes
                 and self._diseases == other._diseases)
 
     def __hash__(self) -> int:
-        return hash((self._labels, self._variants, self._phenotypes, self._diseases))
+        return hash((self._labels, self._sex, self._variants, self._phenotypes, self._diseases))
 
 
 class Cohort(typing.Sized, typing.Iterable[Patient]):
@@ -201,7 +243,7 @@ def list_present_phenotypes(
                 Otherwise, lists only the `top` highest counts
         
         Returns:
-            typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE, 
+            typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE,
                 number of patients with that phenotype)
         """
         counter = Counter()
diff --git a/src/gpsea/preprocessing/_audit.py b/src/gpsea/preprocessing/_audit.py
index f4ab6597d..3bdafe97e 100644
--- a/src/gpsea/preprocessing/_audit.py
+++ b/src/gpsea/preprocessing/_audit.py
@@ -165,7 +165,7 @@ def __init__(self, label: str):
 
 
     @abc.abstractmethod
-    def add_subsection(self, label: str):
+    def add_subsection(self, label: str) -> "Notepad":
         """
         Add a labeled subsection.
 
diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py
index abe83fbcc..136825bd7 100644
--- a/src/gpsea/preprocessing/_phenopacket.py
+++ b/src/gpsea/preprocessing/_phenopacket.py
@@ -4,15 +4,28 @@
 import hpotk
 
 from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket
+import phenopackets.schema.v2.core.individual_pb2 as ppi
 from phenopackets.schema.v2.core.disease_pb2 import Disease as PPDisease
 from phenopackets.schema.v2.core.interpretation_pb2 import GenomicInterpretation
 from phenopackets.vrsatile.v1.vrsatile_pb2 import VcfRecord, VariationDescriptor
 from phenopackets.vrs.v1.vrs_pb2 import Variation
 
-from gpsea.model import Patient, SampleLabels, Disease
-from gpsea.model import  VariantClass, VariantCoordinates, ImpreciseSvInfo, VariantInfo, Variant, Genotype, Genotypes 
+from gpsea.model import SampleLabels, Patient, Sex, Disease
+from gpsea.model import (
+    VariantClass,
+    VariantCoordinates,
+    ImpreciseSvInfo,
+    VariantInfo,
+    Variant,
+    Genotype,
+    Genotypes,
+)
 from gpsea.model.genome import GenomeBuild, GenomicRegion, Strand
-from ._api import VariantCoordinateFinder, FunctionalAnnotator, ImpreciseSvFunctionalAnnotator
+from ._api import (
+    VariantCoordinateFinder,
+    FunctionalAnnotator,
+    ImpreciseSvFunctionalAnnotator,
+)
 from ._audit import Notepad
 from ._patient import PatientCreator
 from ._phenotype import PhenotypeCreator
@@ -27,14 +40,14 @@ def find_genotype(
         self,
         item: GenomicInterpretation,
     ) -> typing.Optional[Genotype]:
-        if item.HasField('variant_interpretation'):
+        if item.HasField("variant_interpretation"):
             variant_interpretation = item.variant_interpretation
-            if variant_interpretation.HasField('variation_descriptor'):
+            if variant_interpretation.HasField("variation_descriptor"):
                 variation_descriptor = variant_interpretation.variation_descriptor
-                if variation_descriptor.HasField('allelic_state'):
+                if variation_descriptor.HasField("allelic_state"):
                     genotype = variation_descriptor.allelic_state.label
                     return self._map_geno_genotype_label(genotype)
-        
+
         return None
 
     @staticmethod
@@ -85,7 +98,7 @@ def find_coordinates(
 
         Args:
             item (GenomicInterpretation): a genomic interpretation element from Phenopacket Schema
-        
+
         Returns:
             typing.Optional[VariantCoordinates]: variant coordinates
         """
@@ -102,8 +115,9 @@ def find_coordinates(
                 variation_descriptor.vcf_record.genome_assembly
             ):
                 raise ValueError(
-                    f"Variant id {variation_descriptor.id} for patient {item.subject_or_biosample_id} has a different Genome Assembly than what was given. "
-                    + f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}."
+                    f"Variant id {variation_descriptor.id} for patient {item.subject_or_biosample_id} "
+                    "has a different Genome Assembly than what was given. "
+                    f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}."
                 )
             contig = self._build.contig_by_name(variation_descriptor.vcf_record.chrom)
             start = int(variation_descriptor.vcf_record.pos) - 1
@@ -134,7 +148,9 @@ def find_coordinates(
                 alt = "<DUP>"
                 change_length = end - start
             else:
-                raise ValueError(f"The copy number of {number} is not supported. Supported values: {{1, 3}}")
+                raise ValueError(
+                    f"The copy number of {number} is not supported. Supported values: {{1, 3}}"
+                )
 
             region = GenomicRegion(contig, start, end, Strand.POSITIVE)
             return VariantCoordinates(region, ref, alt, change_length)
@@ -197,7 +213,7 @@ def _looks_like_large_sv(
             else None
         )
 
-        # If we have these fields, we seem to have all information 
+        # If we have these fields, we seem to have all information
         # to parse the variation descriptor elsewhere.
         return structural_type is not None and gene_context is not None
 
@@ -214,7 +230,19 @@ def __init__(
         functional_annotator: FunctionalAnnotator,
         imprecise_sv_functional_annotator: ImpreciseSvFunctionalAnnotator,
         hgvs_coordinate_finder: VariantCoordinateFinder[str],
+        assume_karyotypic_sex: bool = True,
     ):
+        """
+        Create an instance.
+
+        :param build: the genome build to use for variants.
+        :param phenotype_creator: a phenotype creator for creating phenotypes.
+        :param functional_annotator: for computing functional annotations.
+        :param imprecise_sv_functional_annotator: for getting info about imprecise variants.
+        :param hgvs_coordinate_finder: for finding chromosomal coordinates for HGVS variant descriptions.
+        :param assume_karyotypic_sex: `True` if it is OK to assume that `FEMALE` has the `XX` karyotype
+            and `MALE` has `XY`.
+        """
         self._logger = logging.getLogger(__name__)
         # Violates DI, but it is specific to this class, so I'll leave it "as is".
         self._coord_finder = PhenopacketVariantCoordinateFinder(
@@ -228,49 +256,64 @@ def __init__(
             functional_annotator, FunctionalAnnotator, "functional_annotator"
         )
         self._imprecise_sv_functional_annotator = hpotk.util.validate_instance(
-            imprecise_sv_functional_annotator, ImpreciseSvFunctionalAnnotator, "imprecise_sv_functional_annotator"
+            imprecise_sv_functional_annotator,
+            ImpreciseSvFunctionalAnnotator,
+            "imprecise_sv_functional_annotator",
         )
-        
+
         # Set of sequence ontology IDs that we will treat as a deletion (`DEL`)
         # for the purpose of assigning imprecise SV info with a variant class.
         self._so_deletions = {
-            '1000029',  # chromosomal deletion: An incomplete chromosome.
-            '0001893',  # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature.
-            '0001879',  # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence, where the deletion, is greater than the extent of the underlying genomic features.
+            "1000029",  # chromosomal deletion: An incomplete chromosome.
+            # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature.
+            "0001893",
+            # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence,
+            # where the deletion, is greater than the extent of the underlying genomic features.
+            "0001879",
         }
         self._so_duplications = {
-            '1000037',  # chromosomal_duplication
+            "1000037",  # chromosomal_duplication
         }
+        self._assume_karyotypic_sex = assume_karyotypic_sex
 
-    def process(self, inputs: Phenopacket, notepad: Notepad) -> Patient:
+    def process(self, pp: Phenopacket, notepad: Notepad) -> Patient:
         """Creates a Patient from the data in a given Phenopacket
 
         Args:
-            inputs (Phenopacket): A Phenopacket object
+            pp (Phenopacket): A Phenopacket object
             notepad (Notepad): notepad to write down the issues
         Returns:
             Patient: A Patient object
         """
         sample_id = SampleLabels(
-            label=inputs.subject.id,
-            meta_label=inputs.id if len(inputs.id) > 0 else None,
+            label=pp.subject.id,
+            meta_label=pp.id if len(pp.id) > 0 else None,
         )
 
+        # Extract karyotypic sex
+        indi = notepad.add_subsection("individual")
+        sex = self._extract_sex(pp.subject, indi)
+
         # Check phenotypes
         pfs = notepad.add_subsection("phenotype-features")
         phenotypes = self._phenotype_creator.process(
-            ((pf.type.id, not pf.excluded) for pf in inputs.phenotypic_features), pfs
+            ((pf.type.id, not pf.excluded) for pf in pp.phenotypic_features), pfs
         )
 
         # Check diseases
-        diseases = self._add_diseases([dis for dis in inputs.diseases], pfs)
+        dip = notepad.add_subsection("diseases")
+        diseases = self._add_diseases(pp.diseases, dip)
 
         # Check variants
         vs = notepad.add_subsection("variants")
-        variants = self._add_variants(sample_id, inputs, vs)
-
-        return Patient(
-            sample_id, phenotypes=phenotypes, variants=variants, diseases=diseases
+        variants = self._add_variants(sample_id, pp, vs)
+
+        return Patient.from_raw_parts(
+            sample_id,
+            sex=sex,
+            phenotypes=phenotypes,
+            variants=variants,
+            diseases=diseases,
         )
 
     def _add_diseases(
@@ -285,21 +328,43 @@ def _add_diseases(
             Sequence[Dis]: A list of Disease objects
         """
         if len(diseases) == 0:
-            notepad.add_warning(f"No diseases found.")
-            return []
+            notepad.add_warning("No diseases found")
+            return ()
+
         final_diseases = []
         for i, dis in enumerate(diseases):
             if not dis.HasField("term"):
-                raise ValueError("Could not find term in Disease.")
-            term_id = hpotk.TermId.from_curie(dis.term.id)
+                notepad.add_error(f"#{i} has no `term`")
+                continue
+            else:
+                term_id = hpotk.TermId.from_curie(dis.term.id)
+
             # Do not include excluded diseases if we decide to assume excluded if not included
             final_diseases.append(Disease(term_id, dis.term.label, not dis.excluded))
+
         return final_diseases
 
+    def _extract_sex(
+        self,
+        individual: ppi.Individual,
+        notepad: Notepad,
+    ) -> typing.Optional[Sex]:
+        # Let's use the phenotypic sex as fallback
+        sex = individual.sex
+        if sex == ppi.FEMALE:
+            return Sex.FEMALE
+        elif sex == ppi.MALE:
+            return Sex.MALE
+        elif sex == ppi.OTHER_SEX or sex == ppi.UNKNOWN_SEX:
+            return Sex.UNKNOWN_SEX
+        else:
+            notepad.add_warning(f'Unknown sex type: {sex}')
+            return Sex.UNKNOWN_SEX
+
     def _add_variants(
-        self, 
-        sample_id: SampleLabels, 
-        pp: Phenopacket, 
+        self,
+        sample_id: SampleLabels,
+        pp: Phenopacket,
         notepad: Notepad,
     ) -> typing.Sequence[Variant]:
         """Creates a list of Variant objects from the data in a given Phenopacket
@@ -311,24 +376,28 @@ def _add_variants(
             Sequence[Variant]: A list of Variant objects
         """
         variants = []
-        
+
         for i, interpretation in enumerate(pp.interpretations):
-            sub_note = notepad.add_subsection(f'#{i}')
-            if interpretation.HasField('diagnosis'):
-                for genomic_interpretation in interpretation.diagnosis.genomic_interpretations:
+            sub_note = notepad.add_subsection(f"#{i}")
+            if interpretation.HasField("diagnosis"):
+                for (
+                    genomic_interpretation
+                ) in interpretation.diagnosis.genomic_interpretations:
                     gt = self._gt_parser.find_genotype(genomic_interpretation)
                     if gt is None:
                         sub_note.add_warning(
-                            f"Could not extract genotype from genomic interpretation",
+                            "Could not extract genotype from genomic interpretation",
                             "Remove variant from testing",
                         )
                         continue
-                    
-                    variant_info = self._extract_variant_info(sample_id, genomic_interpretation, sub_note)
+
+                    variant_info = self._extract_variant_info(
+                        sample_id, genomic_interpretation, sub_note
+                    )
                     if variant_info is None:
                         # We already complained in the extract function
                         continue
-                    
+
                     if variant_info.has_variant_coordinates():
                         try:
                             tx_annotations = self._functional_annotator.annotate(
@@ -342,8 +411,10 @@ def _add_variants(
                             continue
                     elif variant_info.has_sv_info():
                         try:
-                            tx_annotations = self._imprecise_sv_functional_annotator.annotate(
-                                item=variant_info.sv_info,
+                            tx_annotations = (
+                                self._imprecise_sv_functional_annotator.annotate(
+                                    item=variant_info.sv_info,
+                                )
                             )
                         except ValueError as error:
                             sub_note.add_warning(
@@ -352,23 +423,25 @@ def _add_variants(
                             )
                             continue
                     else:
-                        raise ValueError('VariantInfo should have either the coordinates or the SV info, but had neither!')
+                        raise ValueError(
+                            "VariantInfo should have either the coordinates or the SV info, but had neither!"
+                        )
 
                     if len(tx_annotations) == 0:
                         sub_note.add_warning(
                             f"Patient {pp.id} has an error with variant {variant_info.variant_key}",
-                            f"Remove variant from testing... tx_anno == 0",
+                            "Remove variant from testing... tx_anno == 0",
                         )
                         continue
 
-                    
                     genotype = Genotypes.single(sample_id, gt)
                     variants.append(
                         Variant(
-                            variant_info=variant_info, 
-                            tx_annotations=tx_annotations, 
+                            variant_info=variant_info,
+                            tx_annotations=tx_annotations,
                             genotypes=genotype,
-                        ))
+                        )
+                    )
 
         if len(variants) == 0:
             notepad.add_warning(f"Patient {pp.id} has no variants to work with")
@@ -385,39 +458,50 @@ def _extract_variant_info(
         sv_info = None
 
         try:
-            variant_coordinates = self._coord_finder.find_coordinates(genomic_interpretation)
+            variant_coordinates = self._coord_finder.find_coordinates(
+                genomic_interpretation
+            )
         except ValueError:
             notepad.add_warning(
-                f"Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` but had an error retrieving any from patient {sample_id}",
+                "Expected a VCF record, a VRS CNV, or an expression with `hgvs.c`"
+                f"but had an error retrieving any from patient {sample_id}",
                 "Remove variant from testing",
             )
             return None
-        
+
         if variant_coordinates is None:
             sv_info = self._map_to_imprecise_sv(genomic_interpretation)
             if sv_info is None:
                 notepad.add_warning(
-                    f"Could not extract the information for large SV annotation",
+                    "Could not extract the information for large SV annotation",
                     "Remove variant from testing",
                 )
                 return None
-        
+
         return VariantInfo(
             variant_coordinates=variant_coordinates,
             sv_info=sv_info,
         )
 
     def _map_to_imprecise_sv(
-        self, 
+        self,
         genomic_interpretation: GenomicInterpretation,
     ) -> typing.Optional[ImpreciseSvInfo]:
-        if genomic_interpretation.HasField('variant_interpretation'):
+        if genomic_interpretation.HasField("variant_interpretation"):
             variant_interpretation = genomic_interpretation.variant_interpretation
-            if variant_interpretation.HasField('variation_descriptor'):
+            if variant_interpretation.HasField("variation_descriptor"):
                 variation_descriptor = variant_interpretation.variation_descriptor
-                
-                structural_type = variation_descriptor.structural_type if variation_descriptor.HasField('structural_type') else None
-                gene_context = variation_descriptor.gene_context if variation_descriptor.HasField('gene_context') else None
+
+                structural_type = (
+                    variation_descriptor.structural_type
+                    if variation_descriptor.HasField("structural_type")
+                    else None
+                )
+                gene_context = (
+                    variation_descriptor.gene_context
+                    if variation_descriptor.HasField("gene_context")
+                    else None
+                )
 
                 if structural_type is not None and gene_context is not None:
                     st = hpotk.TermId.from_curie(curie=structural_type.id)
@@ -428,23 +512,23 @@ def _map_to_imprecise_sv(
                         gene_id=gene_context.value_id,
                         gene_symbol=gene_context.symbol,
                     )
-                
+
         return None
-    
+
     def _map_structural_type_to_variant_class(
         self,
         structural_type: hpotk.TermId,
     ) -> VariantClass:
         # This method is most likely incomplete.
-        # Please open a ticket if you receive a `ValueError` 
+        # Please open a ticket if you receive a `ValueError`
         # for a structural type, that is not mapped at the moment,
         # to help us enhance the mapping.
-        if structural_type.prefix == 'SO':
+        if structural_type.prefix == "SO":
             if structural_type.id in self._so_deletions:
                 return VariantClass.DEL
             elif structural_type.id in self._so_duplications:
                 return VariantClass.DUP
             else:
-                raise ValueError(f'Unknown structural type {structural_type}')
+                raise ValueError(f"Unknown structural type {structural_type}")
         else:
-            raise ValueError(f'Unknown structural type {structural_type}')
+            raise ValueError(f"Unknown structural type {structural_type}")
diff --git a/tests/analysis/conftest.py b/tests/analysis/conftest.py
index 86dd01c43..90b46c8d8 100644
--- a/tests/analysis/conftest.py
+++ b/tests/analysis/conftest.py
@@ -35,8 +35,9 @@ def degenerated_cohort(
 
     return Cohort.from_patients(
         members=(
-            Patient(
+            Patient.from_raw_parts(
                 labels=labels_a,
+                sex=Sex.UNKNOWN_SEX,
                 phenotypes=(
                     Phenotype(
                         term_id=hpotk.TermId.from_curie("HP:0000118"),
@@ -69,8 +70,9 @@ def degenerated_cohort(
                     ),
                 ),
             ),
-            Patient(
+            Patient.from_raw_parts(
                 labels=labels_b,
+                sex=Sex.UNKNOWN_SEX,
                 phenotypes=(
                     Phenotype(
                         term_id=hpotk.TermId.from_curie("HP:0000118"),
diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py
index 22b6d5882..3043d3419 100644
--- a/tests/analysis/predicate/genotype/conftest.py
+++ b/tests/analysis/predicate/genotype/conftest.py
@@ -187,8 +187,9 @@ def patient_w_missense(
     sample_labels: SampleLabels,
     missense_variant: Variant,
 ) -> Patient:
-    return Patient(
+    return Patient.from_raw_parts(
         labels=sample_labels,
+        sex=Sex.UNKNOWN_SEX,
         phenotypes=(),
         diseases=(),
         variants=(
@@ -202,8 +203,9 @@ def patient_w_frameshift(
     sample_labels: SampleLabels,
     frameshift_variant: Variant,
 ) -> Patient:
-    return Patient(
+    return Patient.from_raw_parts(
         labels=sample_labels,
+        sex=Sex.UNKNOWN_SEX,
         phenotypes=(),
         diseases=(),
         variants=(
diff --git a/tests/analysis/predicate/genotype/test_allele_counter.py b/tests/analysis/predicate/genotype/test_allele_counter.py
index 0b4355012..506ec3023 100644
--- a/tests/analysis/predicate/genotype/test_allele_counter.py
+++ b/tests/analysis/predicate/genotype/test_allele_counter.py
@@ -175,8 +175,9 @@ def patient(
         hom_alt_lmna: Variant,
         hemi_dmd: Variant,
     ) -> Patient:
-        return Patient(
+        return Patient.from_raw_parts(
             labels=sample_labels,
+            sex=Sex.UNKNOWN_SEX,
             variants=(
                 het_lmna,
                 hom_alt_lmna,
diff --git a/tests/analysis/test_pscore.py b/tests/analysis/test_pscore.py
index 9b6329029..a4c1c24be 100644
--- a/tests/analysis/test_pscore.py
+++ b/tests/analysis/test_pscore.py
@@ -3,7 +3,7 @@
 import hpotk
 import pytest
 
-from gpsea.model import Patient, Phenotype, SampleLabels
+from gpsea.model import Patient, Phenotype, SampleLabels, Sex
 from gpsea.analysis.pscore import CountingPhenotypeScorer
 
 
@@ -50,8 +50,9 @@ def test_a_patient(
         expected: int,
         counting_scorer: CountingPhenotypeScorer,
     ):
-        patient = Patient(
+        patient = Patient.from_raw_parts(
             labels=SampleLabels("test"),
+            sex=Sex.UNKNOWN_SEX,
             phenotypes=(
                 Phenotype(
                     hpotk.TermId.from_curie(curie),
diff --git a/tests/conftest.py b/tests/conftest.py
index 54f012ecc..b66b55e7d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -268,42 +268,53 @@ def toy_cohort(
         genotypes=Genotypes.from_mapping({SampleLabels('LargeCNV'): Genotype.HETEROZYGOUS}))
 
     patients = (
-        Patient(SampleLabels('HetSingleVar'),
-                phenotypes=(
-                    test_phenotypes['arachnodactyly_T'],
-                    test_phenotypes['spasticity_F'],
-                    test_phenotypes['focal_clonic_seizure_T']),
-                variants=(dup,),
-                diseases=(test_diseases['KBG_T'],)
-                ),
-        Patient(SampleLabels('HetDoubleVar1'),
-                phenotypes=(
-                    test_phenotypes['arachnodactyly_T'], test_phenotypes['seizure_T'], test_phenotypes['spasticity_T'],
-                ),
-                variants=(indel, snv_stop_gain),
-                diseases=(test_diseases['KBG_T'],)
-                ),
-        Patient(SampleLabels('HetDoubleVar2'),
-                phenotypes=(
-                    test_phenotypes['arachnodactyly_F'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
-                ),
-                variants=(snv_missense, del_frameshift),
-                diseases=(test_diseases['KBG_T'],)
-                ),
-        Patient(SampleLabels('HomoVar'),
-                phenotypes=(
-                    test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
-                ),
-                variants=(del_small,),
-                diseases=()
-                ),
-        Patient(SampleLabels('LargeCNV'),
-                phenotypes=(
-                    test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_F'],
-                ),
-                variants=(del_large,),
-                diseases=()
-                ),
+        Patient.from_raw_parts(
+            SampleLabels('HetSingleVar'),
+            sex=Sex.UNKNOWN_SEX,
+            phenotypes=(
+                test_phenotypes['arachnodactyly_T'],
+                test_phenotypes['spasticity_F'],
+                test_phenotypes['focal_clonic_seizure_T']
+            ),
+            variants=(dup,),
+            diseases=(test_diseases['KBG_T'],)
+        ),
+        Patient.from_raw_parts(
+            SampleLabels('HetDoubleVar1'),
+            sex=Sex.UNKNOWN_SEX,
+            phenotypes=(
+                test_phenotypes['arachnodactyly_T'], test_phenotypes['seizure_T'], test_phenotypes['spasticity_T'],
+            ),
+            variants=(indel, snv_stop_gain),
+            diseases=(test_diseases['KBG_T'],)
+        ),
+        Patient.from_raw_parts(
+            SampleLabels('HetDoubleVar2'),
+            sex=Sex.UNKNOWN_SEX,        
+            phenotypes=(
+                test_phenotypes['arachnodactyly_F'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
+            ),
+            variants=(snv_missense, del_frameshift),
+            diseases=(test_diseases['KBG_T'],)
+        ),
+        Patient.from_raw_parts(
+            SampleLabels('HomoVar'),
+            sex=Sex.UNKNOWN_SEX,
+            phenotypes=(
+                test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
+            ),
+            variants=(del_small,),
+            diseases=()
+        ),
+        Patient.from_raw_parts(
+            SampleLabels('LargeCNV'),
+            sex=Sex.UNKNOWN_SEX,
+            phenotypes=(
+                test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_F'],
+            ),
+            variants=(del_large,),
+            diseases=()
+        ),
     )
 
     return Cohort.from_patients(patients)
diff --git a/tests/test_data/SUOX.json b/tests/test_data/SUOX.json
index d85873adc..083f77dd6 100644
--- a/tests/test_data/SUOX.json
+++ b/tests/test_data/SUOX.json
@@ -5,6 +5,7 @@
         "label": "individual_10_PMID_12112661",
         "meta_label": "PMID_36303223_individual_10_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -141,6 +142,7 @@
         "label": "individual_11_PMID_12112661",
         "meta_label": "PMID_36303223_individual_11_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -392,6 +394,7 @@
         "label": "individual_12_PMID_12112661",
         "meta_label": "PMID_36303223_individual_12_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -528,6 +531,7 @@
         "label": "individual_13_PMID_12112661",
         "meta_label": "PMID_36303223_individual_13_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -664,6 +668,7 @@
         "label": "individual_14_PMID_11825068",
         "meta_label": "PMID_36303223_individual_14_PMID_11825068"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0034332",
@@ -840,6 +845,7 @@
         "label": "individual_15_PMID_12001203",
         "meta_label": "PMID_36303223_individual_15_PMID_12001203"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -1000,6 +1006,7 @@
         "label": "individual_16_PMID_12368985",
         "meta_label": "PMID_36303223_individual_16_PMID_12368985"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -1283,6 +1290,7 @@
         "label": "individual_17_PMID_15952210",
         "meta_label": "PMID_36303223_individual_17_PMID_15952210"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -1459,6 +1467,7 @@
         "label": "individual_18_PMID_23414711",
         "meta_label": "PMID_36303223_individual_18_PMID_23414711"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -1639,6 +1648,7 @@
         "label": "individual_19_PMID_23452914",
         "meta_label": "PMID_36303223_individual_19_PMID_23452914"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0002071",
@@ -1827,6 +1837,7 @@
         "label": "individual_1_PMID_9050047",
         "meta_label": "PMID_36303223_individual_1_PMID_9050047"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -1999,6 +2010,7 @@
         "label": "individual_20_PMID_24938149",
         "meta_label": "PMID_36303223_individual_20_PMID_24938149"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -2270,6 +2282,7 @@
         "label": "individual_21_PMID_27289259",
         "meta_label": "PMID_36303223_individual_21_PMID_27289259"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -2442,6 +2455,7 @@
         "label": "individual_22_PMID_27289259",
         "meta_label": "PMID_36303223_individual_22_PMID_27289259"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -2614,6 +2628,7 @@
         "label": "individual_23_PMID_28725568",
         "meta_label": "PMID_36303223_individual_23_PMID_28725568"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -2897,6 +2912,7 @@
         "label": "individual_24_PMID_28629418",
         "meta_label": "PMID_36303223_individual_24_PMID_28629418"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -3065,6 +3081,7 @@
         "label": "individual_25_PMID_31870341",
         "meta_label": "PMID_36303223_individual_25_PMID_31870341"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0034332",
@@ -3348,6 +3365,7 @@
         "label": "individual_26_PMID_31870341",
         "meta_label": "PMID_36303223_individual_26_PMID_31870341"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0034332",
@@ -3631,6 +3649,7 @@
         "label": "individual_27_PMID_31870341",
         "meta_label": "PMID_36303223_individual_27_PMID_31870341"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0034332",
@@ -3914,6 +3933,7 @@
         "label": "individual_28_PMID_33335014",
         "meta_label": "PMID_36303223_individual_28_PMID_33335014"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -4086,6 +4106,7 @@
         "label": "individual_29_PMID_33335014",
         "meta_label": "PMID_36303223_individual_29_PMID_33335014"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -4262,6 +4283,7 @@
         "label": "individual_2_PMID_9600976",
         "meta_label": "PMID_36303223_individual_2_PMID_9600976"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0012758",
@@ -4434,6 +4456,7 @@
         "label": "individual_30_PMID_31806255",
         "meta_label": "PMID_36303223_individual_30_PMID_31806255"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0034332",
@@ -4622,6 +4645,7 @@
         "label": "individual_31_PMID_31806255",
         "meta_label": "PMID_36303223_individual_31_PMID_31806255"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0034332",
@@ -4810,6 +4834,7 @@
         "label": "individual_32_PMID_34025712",
         "meta_label": "PMID_36303223_individual_32_PMID_34025712"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -5097,6 +5122,7 @@
         "label": "individual_33_PMID_33405344",
         "meta_label": "PMID_36303223_individual_33_PMID_33405344"
       },
+      "sex": "FEMALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -5368,6 +5394,7 @@
         "label": "individual_34_PMID_33405344",
         "meta_label": "PMID_36303223_individual_34_PMID_33405344"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -5639,6 +5666,7 @@
         "label": "individual_35_PMID_36303223",
         "meta_label": "PMID_36303223_individual_35_PMID_36303223"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0001083",
@@ -5938,6 +5966,7 @@
         "label": "individual_3_PMID_10519592",
         "meta_label": "PMID_36303223_individual_3_PMID_10519592"
       },
+      "sex": "MALE",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -6229,6 +6258,7 @@
         "label": "individual_4_PMID_12112661",
         "meta_label": "PMID_36303223_individual_4_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -6365,6 +6395,7 @@
         "label": "individual_5_PMID_12112661",
         "meta_label": "PMID_36303223_individual_5_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -6616,6 +6647,7 @@
         "label": "individual_6_PMID_12112661",
         "meta_label": "PMID_36303223_individual_6_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -6752,6 +6784,7 @@
         "label": "individual_7_PMID_12112661",
         "meta_label": "PMID_36303223_individual_7_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -6888,6 +6921,7 @@
         "label": "individual_8_PMID_12112661",
         "meta_label": "PMID_36303223_individual_8_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
@@ -7024,6 +7058,7 @@
         "label": "individual_9_PMID_12112661",
         "meta_label": "PMID_36303223_individual_9_PMID_12112661"
       },
+      "sex": "UNKNOWN_SEX",
       "phenotypes": [
         {
           "term_id": "HP:0001250",
diff --git a/tests/test_io.py b/tests/test_io.py
index a479dceed..7d45cabdd 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -24,7 +24,6 @@ def test_regenerate_cohort(
 ):
     """
     The test for regenerating the `SUOX.json` file based on a cohort of phenopackets.
-    The test needs path to a folder with phenopacket JSON files (empty `str` below).
 
     Note, the test may need to be run multiple times if the ENSEMBL API times out.
     """

From 202be91a8acf4141f79f5e413b32054a125dfd20 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Thu, 29 Aug 2024 15:10:08 +0200
Subject: [PATCH 04/16] Implement `ModeOfInheritanceParedicate`.

---
 src/gpsea/analysis/predicate/_api.py          |  31 +-
 .../analysis/predicate/genotype/__init__.py   |   2 +
 .../predicate/genotype/_gt_predicates.py      | 424 ++++++++++++++++--
 src/gpsea/model/_base.py                      |  24 +
 tests/analysis/predicate/genotype/conftest.py | 400 ++++++++++++++++-
 .../predicate/genotype/test_gt_predicates.py  | 104 ++++-
 6 files changed, 941 insertions(+), 44 deletions(-)

diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py
index 8c62e1910..da446a546 100644
--- a/src/gpsea/analysis/predicate/_api.py
+++ b/src/gpsea/analysis/predicate/_api.py
@@ -1,6 +1,8 @@
 import abc
 import typing
 
+from collections import Counter
+
 import hpotk.util
 
 from gpsea.model import Patient
@@ -136,9 +138,7 @@ class PolyPredicate(typing.Generic[C], metaclass=abc.ABCMeta):
     and *exhaustive* - the groups must cover all possible scenarios.
 
     However, if the patient cannot be assigned into any meaningful category, `None` can be returned.
-
-    Note, that `PolyPredicate` must be used on a happy path - testing a patient must inherently make sense.
-    Predicate will *not* check if, for instance, the patient variants are compatible with a certain mode of inheritance.
+    As a rule of thumb, returning `None` will exclude the patient from the analysis.
     """
 
     @abc.abstractmethod
@@ -210,3 +210,28 @@ def _check_patient(patient: Patient):
         """
         if not isinstance(patient, Patient):
             raise ValueError(f"patient must be type Patient but was type {type(patient)}")
+
+    @staticmethod
+    def _check_categorizations(
+        categorizations: typing.Sequence[Categorization],
+    ) -> typing.Sequence[str]:
+        """
+        Check that the categorizations meet the requirements.
+
+        The requirements include:
+
+        * the `cat_id` must be unique within the predicate
+        """
+        issues = []
+        # There must be at least one category
+
+        # `cat_id` must be unique for a predicate!
+        cat_id_counts = Counter()
+        for c in categorizations:
+            cat_id_counts[c.category.cat_id] += 1
+
+        for cat_id, count in cat_id_counts.items():
+            if count > 1:
+                issues.append(f'`cat_id` {cat_id} is present {count}>1 times')
+        
+        return issues
diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py
index a8a2ba919..ff8d16e1b 100644
--- a/src/gpsea/analysis/predicate/genotype/__init__.py
+++ b/src/gpsea/analysis/predicate/genotype/__init__.py
@@ -2,11 +2,13 @@
 from ._api import VariantPredicate
 from ._counter import AlleleCounter
 from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate
+from ._gt_predicates import ModeOfInheritancePredicate
 from ._variant import VariantPredicates, ProteinPredicates
 
 __all__ = [
     'GenotypePolyPredicate',
     'boolean_predicate', 'groups_predicate', 'recessive_predicate',
+    'ModeOfInheritancePredicate',
     'AlleleCounter', 'VariantPredicate',
     'VariantPredicates', 'ProteinPredicates',
 ]
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 3f54a7e02..601aa70bc 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -1,8 +1,12 @@
+import dataclasses
+import enum
 import typing
 
-from gpsea.model import Patient
+from collections import defaultdict
 
-from .._api import Categorization, PatientCategories
+from gpsea.model import Patient, Sex
+
+from .._api import Categorization, PatientCategory, PatientCategories
 from ._api import GenotypePolyPredicate, RecessiveGroupingPredicate
 from ._api import VariantPredicate
 from ._counter import AlleleCounter
@@ -30,7 +34,10 @@ def get_categorizations(self) -> typing.Sequence[Categorization]:
         :attr:`AlleleCountingGenotypeBooleanPredicate.NO`
         or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category.
         """
-        return AlleleCountingGenotypeBooleanPredicate.YES, AlleleCountingGenotypeBooleanPredicate.NO
+        return (
+            AlleleCountingGenotypeBooleanPredicate.YES,
+            AlleleCountingGenotypeBooleanPredicate.NO,
+        )
 
     def get_question(self) -> str:
         return self._allele_counter.get_question()
@@ -49,15 +56,17 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
             )
 
     def __eq__(self, value: object) -> bool:
-        return isinstance(value, AlleleCountingGenotypeBooleanPredicate) \
+        return (
+            isinstance(value, AlleleCountingGenotypeBooleanPredicate)
             and self._allele_counter == value._allele_counter
-    
+        )
+
     def __hash__(self) -> int:
         return hash((self._allele_counter,))
-    
+
     def __str__(self) -> str:
-        return f'AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})'
-    
+        return f"AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})"
+
     def __repr__(self) -> str:
         return str(self)
 
@@ -83,8 +92,8 @@ def __init__(
     ):
         self._counters = tuple(counters)
         self._categorizations = tuple(categorizations)
-        group_names = ', '.join(c.category.name for c in self._categorizations)
-        self._question = f'Genotype group: {group_names}'
+        group_names = ", ".join(c.category.name for c in self._categorizations)
+        self._question = f"Genotype group: {group_names}"
 
     def get_categorizations(self) -> typing.Sequence[Categorization]:
         return self._categorizations
@@ -108,20 +117,29 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
             return None
 
     def __eq__(self, value: object) -> bool:
-        return isinstance(value, AlleleCountingGroupsPredicate) \
-            and self._counters == value._counters \
+        return (
+            isinstance(value, AlleleCountingGroupsPredicate)
+            and self._counters == value._counters
             and self._categorizations == value._categorizations
+        )
 
     def __hash__(self) -> int:
-        return hash((self._counters, self._categorizations,))
+        return hash(
+            (
+                self._counters,
+                self._categorizations,
+            )
+        )
 
     def __str__(self) -> str:
         return self.get_question()
 
     def __repr__(self) -> str:
-        return 'AlleleCountingGroupsPredicate(' \
-            + 'counters={self._counters}, ' \
-            + 'categorizations={self._categorizations})'
+        return (
+            "AlleleCountingGroupsPredicate("
+            + "counters={self._counters}, "
+            + "categorizations={self._categorizations})"
+        )
 
 
 def groups_predicate(
@@ -141,9 +159,10 @@ def groups_predicate(
     predicates = tuple(predicates)
     group_names = tuple(group_names)
 
-    assert len(predicates) >= 2, f'We need at least 2 predicates: {len(predicates)}'
-    assert len(predicates) == len(group_names), \
-        f'The number of group names must match the number of predicates: {len(group_names)}!={len(predicates)}'
+    assert len(predicates) >= 2, f"We need at least 2 predicates: {len(predicates)}"
+    assert len(predicates) == len(
+        group_names
+    ), f"The number of group names must match the number of predicates: {len(group_names)}!={len(predicates)}"
 
     # Then, prepare the counters and categorizations.
     counters = [AlleleCounter(predicate=predicate) for predicate in predicates]
@@ -170,7 +189,7 @@ class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate):
     #  Therefore, I do not write any tests at this point.
 
     def __init__(
-        self, 
+        self,
         allele_counter: AlleleCounter,
     ):
         self._allele_counter = allele_counter
@@ -190,16 +209,21 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
             return RecessiveGroupingPredicate.BOTH
         else:
             return None
-    
+
     def __eq__(self, value: object) -> bool:
-        return isinstance(value, AlleleCountingRecessivePredicate) and self._allele_counter == value._allele_counter
-    
+        return (
+            isinstance(value, AlleleCountingRecessivePredicate)
+            and self._allele_counter == value._allele_counter
+        )
+
     def __hash__(self) -> int:
         return hash((self._allele_counter,))
-    
+
     def __str__(self) -> str:
-        return f'AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})'
-    
+        return (
+            f"AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})"
+        )
+
     def __repr__(self) -> str:
         return str(self)
 
@@ -209,8 +233,8 @@ def recessive_predicate(
 ) -> GenotypePolyPredicate:
     """
     Create a recessive grouping predicate from given `variant_predicate`
-    to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`, 
-    :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`, 
+    to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`,
+    :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`,
     depending on the number of variant alleles matching the variant predicate.
 
     The patient is assigned into a group in the following manner:
@@ -221,3 +245,347 @@ def recessive_predicate(
     """
     allele_counter = AlleleCounter(predicate=variant_predicate)
     return AlleleCountingRecessivePredicate(allele_counter=allele_counter)
+
+
+@dataclasses.dataclass(eq=True, frozen=True)
+class GenotypeGroup:
+    allele_count: int
+    sex: typing.Optional[Sex]
+    categorization: Categorization
+
+
+class MendelianInheritanceAspect(enum.Enum):
+    AUTOSOMAL = 0
+    """
+    Related to chromosomes that do *not* determine the sex of an individual.
+    """
+    
+    GONOSOMAL = 1
+    """
+    Related to chromosomes that determine the sex of an individual.
+    """
+    
+    MITOCHONDRIAL = 2
+    """
+    Related to mitochondrial DNA.
+    """
+
+
+class ModeOfInheritanceInfo:
+
+    @staticmethod
+    def autosomal_dominant() -> "ModeOfInheritanceInfo":
+        groups = (
+            GenotypeGroup(
+                allele_count=0,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(
+                        cat_id=0, name="0/0", description="Homozygous reference"
+                    ),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=1,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
+                ),
+            ),
+        )
+        return ModeOfInheritanceInfo(
+            mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL,
+            groups=groups,
+        )
+
+    @staticmethod
+    def autosomal_recessive() -> "ModeOfInheritanceInfo":
+        groups = (
+            GenotypeGroup(
+                allele_count=0,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(
+                        cat_id=0, name="0/0", description="Homozygous reference"
+                    ),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=1,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=2,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(cat_id=2, name="1/1", description="Homozygous alternate"),
+                ),
+            ),
+        )
+        return ModeOfInheritanceInfo(
+            mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL,
+            groups=groups,
+        )
+
+    @staticmethod
+    def x_dominant() -> "ModeOfInheritanceInfo":
+        groups = (
+            GenotypeGroup(
+                allele_count=0,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(
+                        cat_id=0, name="0", description="Homozygous reference"
+                    ),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=1,
+                sex=Sex.FEMALE,
+                categorization=Categorization(
+                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=1,
+                sex=Sex.MALE,
+                categorization=Categorization(
+                    PatientCategory(cat_id=2, name="1", description="Hemizygous"),
+                ),
+            ),
+        )
+        return ModeOfInheritanceInfo(
+            mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL,
+            groups=groups,
+        )
+
+    @staticmethod
+    def x_recessive() -> "ModeOfInheritanceInfo":
+        groups = (
+            GenotypeGroup(
+                allele_count=0,
+                sex=None,
+                categorization=Categorization(
+                    PatientCategory(
+                        cat_id=0, name="0/0", description="Homozygous reference"
+                    ),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=1,
+                sex=Sex.FEMALE,
+                categorization=Categorization(
+                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
+                ),
+            ),
+            GenotypeGroup(
+                allele_count=1,
+                sex=Sex.MALE,
+                categorization=Categorization(
+                    PatientCategory(cat_id=2, name="1", description="Hemizygous"),
+                ),
+            ),
+        )
+
+        return ModeOfInheritanceInfo(
+            mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL,
+            groups=groups,
+        )
+
+    def __init__(
+        self,
+        mendelian_inheritance_aspect: MendelianInheritanceAspect,
+        groups: typing.Iterable[GenotypeGroup],
+    ):
+        # We pre-compute the hash manually.
+        # The correctness depends on two default dicts with same keys and values
+        # comparing equal.
+        hash_value = 17
+        assert isinstance(mendelian_inheritance_aspect, MendelianInheritanceAspect)
+        self._aspect = mendelian_inheritance_aspect
+
+        hash_value += 31 * hash(self._aspect)
+
+        self._groups = defaultdict(list)
+        for group in groups:
+            assert isinstance(group, GenotypeGroup)
+            self._groups[group.allele_count].append(group)
+            hash_value += 13 * hash(group)
+            
+        self._hash = hash_value
+
+    @property
+    def groups(self) -> typing.Iterator[GenotypeGroup]:
+        # Flatten `values()` which is an iterable of lists.
+        return (group for meta_group in self._groups.values() for group in meta_group)
+
+    @property
+    def mendelian_inheritance_aspect(self) -> MendelianInheritanceAspect:
+        return self._aspect
+
+    def get_groups_for_allele_count(
+        self,
+        allele_count: int,
+    ) -> typing.Sequence[GenotypeGroup]:
+        try:
+            return self._groups[allele_count]
+        except KeyError:
+            # No group for this allele count is OK
+            return ()
+
+    def is_autosomal(self) -> bool:
+        return self._aspect == MendelianInheritanceAspect.AUTOSOMAL
+
+    def is_gonosomal(self) -> bool:
+        return self._aspect == MendelianInheritanceAspect.GONOSOMAL
+
+    def is_mitochondrial(self) -> bool:
+        return self._aspect == MendelianInheritanceAspect.MITOCHONDRIAL
+
+    def __eq__(self, value: object) -> bool:
+        return (
+            isinstance(value, ModeOfInheritanceInfo)
+            and self._aspect == value._aspect
+            and self._groups == value._groups
+        )
+
+    def __hash__(self) -> int:
+        return self._hash
+
+    def __str__(self) -> str:
+        return f"ModeOfInheritanceInfo(aspect={self._aspect}, groups={self._groups})"
+
+    def __repr__(self) -> str:
+        return str(self)
+
+
+class ModeOfInheritancePredicate(GenotypePolyPredicate):
+
+    @staticmethod
+    def autosomal_dominant(
+        variant_predicate: VariantPredicate,
+    ) -> "ModeOfInheritancePredicate":
+        return ModeOfInheritancePredicate.from_moi_info(
+            variant_predicate=variant_predicate,
+            mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_dominant(),
+        )
+    
+    @staticmethod
+    def autosomal_recessive(
+        variant_predicate: VariantPredicate,
+    ) -> "ModeOfInheritancePredicate":
+        return ModeOfInheritancePredicate.from_moi_info(
+            variant_predicate=variant_predicate,
+            mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_recessive(),
+        )
+    
+    @staticmethod
+    def x_dominant(
+        variant_predicate: VariantPredicate,
+    ) -> "ModeOfInheritancePredicate":
+        return ModeOfInheritancePredicate.from_moi_info(
+            variant_predicate=variant_predicate,
+            mode_of_inheritance_info=ModeOfInheritanceInfo.x_dominant(),
+        )
+
+    @staticmethod
+    def x_recessive(
+        variant_predicate: VariantPredicate,
+    ) -> "ModeOfInheritancePredicate":
+        return ModeOfInheritancePredicate.from_moi_info(
+            variant_predicate=variant_predicate,
+            mode_of_inheritance_info=ModeOfInheritanceInfo.x_recessive(),
+        )
+
+    @staticmethod
+    def from_moi_info(
+        variant_predicate: VariantPredicate,
+        mode_of_inheritance_info: ModeOfInheritanceInfo,
+    ) -> "ModeOfInheritancePredicate":
+        allele_counter = AlleleCounter(predicate=variant_predicate)
+        return ModeOfInheritancePredicate(
+            allele_counter=allele_counter,
+            mode_of_inheritance_info=mode_of_inheritance_info,
+        )
+
+    def __init__(
+        self,
+        allele_counter: AlleleCounter,
+        mode_of_inheritance_info: ModeOfInheritanceInfo,
+    ):
+        assert isinstance(allele_counter, AlleleCounter)
+        self._allele_counter = allele_counter
+
+        assert isinstance(mode_of_inheritance_info, ModeOfInheritanceInfo)
+        self._moi_info = mode_of_inheritance_info
+
+        self._categorizations = tuple(group.categorization for group in mode_of_inheritance_info.groups)
+        issues = ModeOfInheritancePredicate._check_categorizations(self._categorizations)
+        if issues:
+            raise ValueError('Cannot create predicate: {}'.format(', '.join(issues)))
+        self._question = 'Which genotype group does the patient fit in: {}'.format(
+            ', '.join(cat.category.name for cat in self._categorizations),
+        )
+
+    def get_categorizations(self) -> typing.Sequence[Categorization]:
+        return self._categorizations
+
+    def get_question(self) -> str:
+        return self._question
+
+    def test(
+        self,
+        patient: Patient,
+    ) -> typing.Optional[Categorization]:
+        self._check_patient(patient)
+
+        if self._moi_info.is_autosomal():
+            allele_count = self._allele_counter.count(patient)
+            groups = self._moi_info.get_groups_for_allele_count(allele_count)
+            if len(groups) == 1:
+                return groups[0].categorization
+            else:
+                return None
+        elif self._moi_info.is_gonosomal():
+            if patient.sex.is_provided():
+                allele_count = self._allele_counter.count(patient)
+                groups = self._moi_info.get_groups_for_allele_count(allele_count)
+                for group in groups:
+                    if group.sex is not None and group.sex == patient.sex:
+                        return group.categorization
+                return None
+            else:
+                # We must have patient's sex
+                # to do any meaningful analysis
+                # in the non-autosomal scenario.
+                return None
+
+        elif self._moi_info.is_mitochondrial():
+            # Cannot deal with mitochondrial inheritance right now.
+            return None
+        else:
+            # Bug, please report to the developers
+            raise ValueError("Unexpected mode of inheritance condition")
+
+    def __eq__(self, value: object) -> bool:
+        return (
+            isinstance(value, ModeOfInheritancePredicate)
+            and self._allele_counter == value._allele_counter
+            and self._moi_info == value._moi_info
+        )
+
+    def __hash__(self) -> int:
+        return hash((self._allele_counter, self._moi_info,))
+
+    def __str__(self) -> str:
+        return (
+            "ModeOfInheritancePredicate("
+            f"allele_counter={self._allele_counter}, "
+            f"moi_info={self._moi_info})"
+        )
+
+    def __repr__(self) -> str:
+        return str(self)
diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py
index 0effeab06..0d541a65d 100644
--- a/src/gpsea/model/_base.py
+++ b/src/gpsea/model/_base.py
@@ -26,6 +26,30 @@ class Sex(enum.Enum):
     Male sex. Maps to `NCIT:C46112`.
     """
 
+    def is_provided(self) -> bool:
+        """
+        Return `True` if the sex is a known value, such as `FEMALE` or `MALE`.
+        """
+        return self != Sex.UNKNOWN_SEX
+
+    def is_unknown(self) -> bool:
+        """
+        Return `True` if this is an `UNKNOWN_SEX`.
+        """
+        return self == Sex.UNKNOWN_SEX
+
+    def is_female(self) -> bool:
+        """
+        Return `True` if the sex represents a `FEMALE`.
+        """
+        return self == Sex.MALE
+
+    def is_male(self) -> bool:
+        """
+        Return `True` if the sex represents a `MALE`.
+        """
+        return self == Sex.MALE
+
 
 class SampleLabels:
     """
diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py
index 3043d3419..c44a214b8 100644
--- a/tests/analysis/predicate/genotype/conftest.py
+++ b/tests/analysis/predicate/genotype/conftest.py
@@ -192,9 +192,7 @@ def patient_w_missense(
         sex=Sex.UNKNOWN_SEX,
         phenotypes=(),
         diseases=(),
-        variants=(
-            missense_variant,
-        ),
+        variants=(missense_variant,),
     )
 
 
@@ -208,7 +206,399 @@ def patient_w_frameshift(
         sex=Sex.UNKNOWN_SEX,
         phenotypes=(),
         diseases=(),
-        variants=(
-            frameshift_variant,
+        variants=(frameshift_variant,),
+    )
+
+
+"""
+Genesis family - Autosomal dominant
+
+* Adam - father, affected
+* Eve - mother, unaffected
+* Cain - son, affected
+"""
+
+
+@pytest.fixture(scope="package")
+def genesis_mutation(
+    genome_build: GenomeBuild,
+    adam_label: SampleLabels,
+    eve_label: SampleLabels,
+    cain_label: SampleLabels,
+) -> Variant:
+    chr22 = genome_build.contig_by_name("chr22")
+    assert chr22 is not None
+    return Variant(
+        variant_info=VariantInfo(
+            variant_coordinates=VariantCoordinates(
+                region=GenomicRegion(
+                    contig=chr22,
+                    start=100,
+                    end=101,
+                    strand=Strand.POSITIVE,
+                ),
+                ref="C",
+                alt="G",
+                change_length=0,
+            )
+        ),
+        tx_annotations=(
+            TranscriptAnnotation(
+                gene_id="a_gene",
+                tx_id="tx:xyz",
+                hgvs_cdna=None,
+                is_preferred=False,
+                variant_effects=(
+                    VariantEffect.MISSENSE_VARIANT,
+                    VariantEffect.SPLICE_DONOR_VARIANT,
+                ),
+                affected_exons=(4,),
+                protein_id="pt:xyz",
+                hgvsp=None,
+                protein_effect_coordinates=Region(40, 41),
+            ),
+        ),
+        genotypes=Genotypes.from_mapping(
+            {
+                adam_label: Genotype.HETEROZYGOUS,
+                eve_label: Genotype.HOMOZYGOUS_REFERENCE,
+                cain_label: Genotype.HETEROZYGOUS,
+            }
+        ),
+    )
+
+
+@pytest.fixture(scope="package")
+def adam_label() -> SampleLabels:
+    return SampleLabels("Adam")
+
+
+@pytest.fixture(scope="package")
+def adam(
+    adam_label: SampleLabels,
+    genesis_mutation: Variant,
+) -> Patient:
+    return Patient(
+        adam_label,
+        sex=Sex.MALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(genesis_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def eve_label() -> SampleLabels:
+    return SampleLabels("Eve")
+
+
+@pytest.fixture(scope="package")
+def eve(
+    eve_label: SampleLabels,
+    genesis_mutation: Variant,
+) -> Patient:
+    return Patient(
+        eve_label,
+        sex=Sex.FEMALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(genesis_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def cain_label() -> SampleLabels:
+    return SampleLabels("Cain")
+
+
+@pytest.fixture(scope="package")
+def cain(
+    cain_label: SampleLabels,
+    genesis_mutation: Variant,
+) -> Patient:
+    return Patient(
+        cain_label,
+        sex=Sex.MALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(genesis_mutation,),
+    )
+
+
+"""
+White family - Autosomal recessive
+
+* Walt - father, HET
+* Skyler - mother, HET
+* Flynn - son, HOM_ALT
+* Holly - daughter, HOM_REF
+"""
+
+
+@pytest.fixture(scope="package")
+def white_mutation(
+    genome_build: GenomeBuild,
+    walt_label: SampleLabels,
+    skyler_label: SampleLabels,
+    flynn_label: SampleLabels,
+    holly_label: SampleLabels,
+) -> Variant:
+    chr22 = genome_build.contig_by_name("chr22")
+    assert chr22 is not None
+    return Variant(
+        variant_info=VariantInfo(
+            variant_coordinates=VariantCoordinates(
+                region=GenomicRegion(
+                    contig=chr22,
+                    start=100,
+                    end=101,
+                    strand=Strand.POSITIVE,
+                ),
+                ref="C",
+                alt="G",
+                change_length=0,
+            )
+        ),
+        tx_annotations=(
+            TranscriptAnnotation(
+                gene_id="a_gene",
+                tx_id="tx:xyz",
+                hgvs_cdna=None,
+                is_preferred=False,
+                variant_effects=(
+                    VariantEffect.MISSENSE_VARIANT,
+                    VariantEffect.SPLICE_DONOR_VARIANT,
+                ),
+                affected_exons=(4,),
+                protein_id="pt:xyz",
+                hgvsp=None,
+                protein_effect_coordinates=Region(40, 41),
+            ),
+        ),
+        genotypes=Genotypes.from_mapping(
+            {
+                walt_label: Genotype.HETEROZYGOUS,
+                skyler_label: Genotype.HETEROZYGOUS,
+                flynn_label: Genotype.HOMOZYGOUS_ALTERNATE,
+                holly_label: Genotype.HOMOZYGOUS_REFERENCE,
+            }
         ),
     )
+
+
+@pytest.fixture(scope="package")
+def walt_label() -> SampleLabels:
+    return SampleLabels("Walt")
+
+
+@pytest.fixture(scope="package")
+def walt(
+    walt_label: SampleLabels,
+    white_mutation: Variant,
+) -> Patient:
+    return Patient(
+        walt_label,
+        sex=Sex.MALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(white_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def skyler_label() -> SampleLabels:
+    return SampleLabels("Skyler")
+
+
+@pytest.fixture(scope="package")
+def skyler(
+    skyler_label: SampleLabels,
+    white_mutation: Variant,
+) -> Patient:
+    return Patient(
+        skyler_label,
+        sex=Sex.FEMALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(white_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def flynn_label() -> SampleLabels:
+    return SampleLabels("Flynn")
+
+
+@pytest.fixture(scope="package")
+def flynn(
+    flynn_label: SampleLabels,
+    white_mutation: Variant,
+) -> Patient:
+    return Patient(
+        flynn_label,
+        sex=Sex.MALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(white_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def holly_label() -> SampleLabels:
+    return SampleLabels("Holly")
+
+
+@pytest.fixture(scope="package")
+def holly(
+    holly_label: SampleLabels,
+    white_mutation: Variant,
+) -> Patient:
+    return Patient(
+        holly_label,
+        sex=Sex.FEMALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(white_mutation,),
+    )
+
+
+"""
+Skywalker family - X-linked recessive
+
+* Anakin - father, homozygous reference  (possibly hemizygous reference?)
+* Padme - mother, heterozygous
+* Luke - son, hemizygous
+* Leia - daughter, heterozygous
+"""
+
+
+@pytest.fixture(scope="package")
+def skywalker_mutation(
+    genome_build: GenomeBuild,
+    anakin_label: SampleLabels,
+    padme_label: SampleLabels,
+    luke_label: SampleLabels,
+    leia_label: SampleLabels,
+) -> Variant:
+    chrX = genome_build.contig_by_name("chrX")
+    assert chrX is not None
+    return Variant(
+        variant_info=VariantInfo(
+            variant_coordinates=VariantCoordinates(
+                region=GenomicRegion(
+                    contig=chrX,
+                    start=100,
+                    end=101,
+                    strand=Strand.POSITIVE,
+                ),
+                ref="C",
+                alt="G",
+                change_length=0,
+            )
+        ),
+        tx_annotations=(
+            TranscriptAnnotation(
+                gene_id="a_gene",
+                tx_id="tx:xyz",
+                hgvs_cdna=None,
+                is_preferred=False,
+                variant_effects=(
+                    VariantEffect.MISSENSE_VARIANT,
+                    VariantEffect.SPLICE_DONOR_VARIANT,
+                ),
+                affected_exons=(4,),
+                protein_id="pt:xyz",
+                hgvsp=None,
+                protein_effect_coordinates=Region(40, 41),
+            ),
+        ),
+        genotypes=Genotypes.from_mapping(
+            {
+                anakin_label: Genotype.HOMOZYGOUS_REFERENCE,
+                padme_label: Genotype.HETEROZYGOUS,
+                luke_label: Genotype.HEMIZYGOUS,
+                leia_label: Genotype.HETEROZYGOUS,
+            }
+        ),
+    )
+
+
+@pytest.fixture(scope="package")
+def anakin_label() -> SampleLabels:
+    return SampleLabels("Anakin")
+
+
+@pytest.fixture(scope="package")
+def anakin(
+    anakin_label: SampleLabels,
+    skywalker_mutation: Variant,
+) -> Patient:
+    return Patient(
+        anakin_label,
+        sex=Sex.MALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(skywalker_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def padme_label() -> SampleLabels:
+    return SampleLabels("Padme")
+
+
+@pytest.fixture(scope="package")
+def padme(
+    padme_label: SampleLabels,
+    skywalker_mutation: Variant,
+) -> Patient:
+    return Patient(
+        padme_label,
+        sex=Sex.FEMALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(skywalker_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def luke_label() -> SampleLabels:
+    return SampleLabels("Luke")
+
+
+@pytest.fixture(scope="package")
+def luke(
+    luke_label: SampleLabels,
+    skywalker_mutation: Variant,
+) -> Patient:
+    return Patient(
+        luke_label,
+        sex=Sex.MALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(skywalker_mutation,),
+    )
+
+
+@pytest.fixture(scope="package")
+def leia_label() -> SampleLabels:
+    return SampleLabels("Leia")
+
+
+@pytest.fixture(scope="package")
+def leia(
+    leia_label: SampleLabels,
+    skywalker_mutation: Variant,
+) -> Patient:
+    return Patient(
+        leia_label,
+        sex=Sex.FEMALE,
+        phenotypes=(),
+        diseases=(),
+        variants=(skywalker_mutation,),
+    )
+
+
+"""
+XR family
+"""
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index 3e71283b4..ac6977c8e 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -5,22 +5,23 @@
     GenotypePolyPredicate,
     groups_predicate,
     VariantPredicates,
+    VariantPredicate,
+    ModeOfInheritancePredicate,
 )
 
 
-class TestGroupsPredicate:
+TX_ID = "tx:xyz"
+
 
-    TX_ID = "tx:xyz"
+class TestGroupsPredicate:
 
     @pytest.fixture(scope="class")
     def predicate(self) -> GenotypePolyPredicate:
         return groups_predicate(
             predicates=(
+                VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID),
                 VariantPredicates.variant_effect(
-                    VariantEffect.MISSENSE_VARIANT, TestGroupsPredicate.TX_ID
-                ),
-                VariantPredicates.variant_effect(
-                    VariantEffect.FRAMESHIFT_VARIANT, TestGroupsPredicate.TX_ID
+                    VariantEffect.FRAMESHIFT_VARIANT, TX_ID
                 ),
             ),
             group_names=(
@@ -53,7 +54,7 @@ def test_get_categorizations(
 
     def test_test__missense(
         self,
-        patient_w_missense: Variant,
+        patient_w_missense: Patient,
         predicate: GenotypePolyPredicate,
     ):
         cat = predicate.test(patient_w_missense)
@@ -65,7 +66,7 @@ def test_test__missense(
 
     def test_test__frameshift(
         self,
-        patient_w_frameshift: Variant,
+        patient_w_frameshift: Patient,
         predicate: GenotypePolyPredicate,
     ):
         cat = predicate.test(patient_w_frameshift)
@@ -74,3 +75,90 @@ def test_test__frameshift(
         assert cat.category.cat_id == 1
         assert cat.category.name == "LoF"
         assert cat.category.description == "FRAMESHIFT_VARIANT on tx:xyz"
+
+
+class TestModeOfInheritancePredicate:
+
+    @pytest.fixture(scope="class")
+    def variant_predicate(self) -> VariantPredicate:
+        return VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID)
+
+    @pytest.mark.parametrize(
+        "patient_name,cat_id,name",
+        [
+            ("adam", 1, "0/1"),
+            ("eve", 0, "0/0"),
+            ("cain", 1, "0/1"),
+        ],
+    )
+    def test_autosomal_dominant(
+        self,
+        patient_name: str,
+        cat_id: int,
+        name: str,
+        variant_predicate: VariantPredicate,
+        request: pytest.FixtureRequest,
+    ):
+        patient = request.getfixturevalue(patient_name)
+        predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate)
+
+        categorization = predicate.test(patient)
+
+        assert categorization is not None
+
+        assert categorization.category.cat_id == cat_id
+        assert categorization.category.name == name
+
+    @pytest.mark.parametrize(
+        "patient_name,cat_id,name",
+        [
+            ("walt", 1, "0/1"),
+            ("skyler", 1, "0/1"),
+            ("flynn", 2, "1/1"),
+            ("holly", 0, "0/0"),
+        ],
+    )
+    def test_autosomal_recessive(
+        self,
+        patient_name: str,
+        cat_id: int,
+        name: str,
+        variant_predicate: VariantPredicate,
+        request: pytest.FixtureRequest,
+    ):
+        patient = request.getfixturevalue(patient_name)
+        predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate)
+
+        categorization = predicate.test(patient)
+
+        assert categorization is not None
+
+        assert categorization.category.cat_id == cat_id
+        assert categorization.category.name == name
+
+    @pytest.mark.parametrize(
+        "patient_name,cat_id,name",
+        [
+            ("anakin", 0, "0/0"),
+            ("padme", 1, "0/1"),
+            ("luke", 2, "1"),
+            ("leia", 1, "0/1"),
+        ],
+    )
+    def test_x_recessive(
+        self,
+        patient_name: str,
+        cat_id: int,
+        name: str,
+        variant_predicate: VariantPredicate,
+        request: pytest.FixtureRequest,
+    ):
+        patient = request.getfixturevalue(patient_name)
+        predicate = ModeOfInheritancePredicate.x_recessive(variant_predicate)
+
+        categorization = predicate.test(patient)
+
+        assert categorization is not None
+
+        assert categorization.category.cat_id == cat_id
+        assert categorization.category.name == name

From ecf589c2220bb2e14d56806c1f8fbff0c0701eb5 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Fri, 30 Aug 2024 10:28:48 +0200
Subject: [PATCH 05/16] Move `DeVriesScorer` test elsewhere.

---
 tests/analysis/{ => pscore}/test_de_vries_scorer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 rename tests/analysis/{ => pscore}/test_de_vries_scorer.py (97%)

diff --git a/tests/analysis/test_de_vries_scorer.py b/tests/analysis/pscore/test_de_vries_scorer.py
similarity index 97%
rename from tests/analysis/test_de_vries_scorer.py
rename to tests/analysis/pscore/test_de_vries_scorer.py
index 220dd18b0..89679a37f 100644
--- a/tests/analysis/test_de_vries_scorer.py
+++ b/tests/analysis/pscore/test_de_vries_scorer.py
@@ -4,7 +4,7 @@
 import pytest
 
 from gpsea.analysis.pscore import DeVriesPhenotypeScorer
-from gpsea.model import Patient, SampleLabels, Phenotype
+from gpsea.model import Patient, SampleLabels, Phenotype, Sex
 
 intrauterine_growth_retardation = 'HP:0001511'
 small_for_gestational_age = 'HP:0001518'
@@ -69,6 +69,7 @@ def test_a_patient(
     ):
         patient = Patient(
             labels=SampleLabels("test"),
+            sex=Sex.UNKNOWN_SEX,
             phenotypes=(
                 Phenotype.from_raw_parts(
                     term_id=curie,

From 0eb40a6d711dd1dfd16ab028441a5b432b78de43 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Fri, 30 Aug 2024 10:32:55 +0200
Subject: [PATCH 06/16] Fine tune the inheritance mode assignment.

---
 .../analysis/predicate/genotype/_gt_predicates.py  | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 601aa70bc..1cd1a6577 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -553,9 +553,17 @@ def test(
             if patient.sex.is_provided():
                 allele_count = self._allele_counter.count(patient)
                 groups = self._moi_info.get_groups_for_allele_count(allele_count)
-                for group in groups:
-                    if group.sex is not None and group.sex == patient.sex:
-                        return group.categorization
+                if len(groups) == 0:
+                    # Unable to assign the individual.
+                    return None
+                elif len(groups) == 1:
+                    # We can only assign into one category no matter what the individual's sex is.
+                    return groups[0].categorization
+                else:
+                    # We choose depending on the sex.
+                    for group in groups:
+                        if group.sex is not None and group.sex == patient.sex:
+                            return group.categorization
                 return None
             else:
                 # We must have patient's sex

From 29d126551ecc3b50f47d9a99a9d4cc40fdc3a9e3 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Fri, 30 Aug 2024 16:00:53 +0200
Subject: [PATCH 07/16] Improve documentation for predicates.

---
 docs/report/tbx5_frameshift_vs_missense.csv   | 244 +-------------
 docs/tutorial.rst                             |   6 +-
 docs/user-guide/predicates.rst                | 315 ++++++++++++++++--
 docs/user-guide/report/tbx5_frameshift.csv    | 282 ++--------------
 docs/user-guide/stats.rst                     |  45 ++-
 .../predicate/genotype/_gt_predicates.py      |  89 +++--
 src/gpsea/model/_base.py                      |   6 +-
 tests/analysis/predicate/genotype/conftest.py |  15 +-
 .../predicate/genotype/test_gt_predicates.py  |  58 ++--
 9 files changed, 445 insertions(+), 615 deletions(-)

diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv
index 421e69272..07245a440 100644
--- a/docs/report/tbx5_frameshift_vs_missense.csv
+++ b/docs/report/tbx5_frameshift_vs_missense.csv
@@ -16,247 +16,7 @@ Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.77
 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
-Abnormal carpal morphology [HP:0001191],30/32,94%,0/0,0%,,
-Abnormal hand morphology [HP:0005922],53/53,100%,20/20,100%,,
-Abnormality of the hand [HP:0001155],60/60,100%,31/31,100%,,
+Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,,
+Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,,
 Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,,
 Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,,
-Phenotypic abnormality [HP:0000118],82/82,100%,38/38,100%,,
-All [HP:0000001],82/82,100%,38/38,100%,,
-Abnormality of the wrist [HP:0003019],30/30,100%,0/0,0%,,
-Abnormality of upper limb joint [HP:0009810],30/30,100%,6/6,100%,,
-Abnormal joint morphology [HP:0001367],31/31,100%,6/6,100%,,
-Abnormal skeletal morphology [HP:0011842],73/73,100%,35/35,100%,,
-Abnormality of the skeletal system [HP:0000924],73/73,100%,35/35,100%,,
-Abnormality of the musculoskeletal system [HP:0033127],74/74,100%,35/35,100%,,
-Short thumb [HP:0009778],11/41,27%,8/30,27%,,
-Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,,
-Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,,
-Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,,
-Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55,100%,22/22,100%,,
-Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55,100%,22/22,100%,,
-Aplasia/hypoplasia of the extremities [HP:0009815],55/55,100%,22/22,100%,,
-Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56,100%,23/23,100%,,
-Abnormal limb bone morphology [HP:0002813],63/63,100%,34/34,100%,,
-Abnormality of limb bone [HP:0040068],63/63,100%,34/34,100%,,
-Abnormal appendicular skeleton morphology [HP:0011844],64/64,100%,34/34,100%,,
-Abnormal finger morphology [HP:0001167],36/36,100%,31/31,100%,,
-Abnormal digit morphology [HP:0011297],38/38,100%,33/33,100%,,
-Abnormal thumb morphology [HP:0001172],30/30,100%,31/31,100%,,
-Short finger [HP:0009381],11/11,100%,8/8,100%,,
-Short digit [HP:0011927],11/11,100%,10/10,100%,,
-Complete atrioventricular canal defect [HP:0001674],5/37,14%,3/36,8%,,
-Atrioventricular canal defect [HP:0006695],5/5,100%,3/3,100%,,
-Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,,
-Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
-Abnormal cardiovascular system morphology [HP:0030680],63/63,100%,30/30,100%,,
-Abnormality of the cardiovascular system [HP:0001626],65/65,100%,32/32,100%,,
-11 pairs of ribs [HP:0000878],1/1,100%,0/0,0%,,
-Missing ribs [HP:0000921],1/1,100%,0/0,0%,,
-Aplasia/Hypoplasia of the ribs [HP:0006712],1/1,100%,0/0,0%,,
-Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],1/1,100%,2/2,100%,,
-Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,2/2,100%,,
-Abnormal axial skeleton morphology [HP:0009121],8/8,100%,5/5,100%,,
-Abnormal thorax morphology [HP:0000765],6/6,100%,5/5,100%,,
-Abnormal rib morphology [HP:0000772],1/1,100%,0/0,0%,,
-Abnormal rib cage morphology [HP:0001547],4/4,100%,0/0,0%,,
-Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,,
-Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,,
-Abnormal cardiac atrium morphology [HP:0005120],43/43,100%,20/20,100%,,
-Finger aplasia [HP:0009380],15/15,100%,14/14,100%,,
-Aplasia involving forearm bones [HP:0009822],7/7,100%,6/6,100%,,
-Aplasia/hypoplasia involving forearm bones [HP:0006503],37/37,100%,12/12,100%,,
-Abnormal forearm bone morphology [HP:0040072],37/37,100%,14/14,100%,,
-Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,,
-Abnormal forearm morphology [HP:0002973],37/37,100%,14/14,100%,,
-Aplasia/Hypoplasia of the radius [HP:0006501],37/37,100%,11/11,100%,,
-Abnormal morphology of the radius [HP:0002818],37/37,100%,13/13,100%,,
-Absent forearm bone [HP:0003953],7/7,100%,6/6,100%,,
-Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,,
-Upper limb undergrowth [HP:0009824],33/33,100%,7/7,100%,,
-Limb undergrowth [HP:0009826],33/33,100%,7/7,100%,,
-Short long bone [HP:0003026],35/35,100%,9/9,100%,,
-Abnormal long bone morphology [HP:0011314],44/44,100%,13/13,100%,,
-Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,,
-Abnormality of thumb phalanx [HP:0009602],13/13,100%,13/13,100%,,
-Limited pronation/supination of forearm [HP:0006394],0/0,0%,3/3,100%,,
-Limited elbow movement [HP:0002996],0/0,0%,4/4,100%,,
-Abnormality of the elbow [HP:0009811],0/0,0%,5/5,100%,,
-Limitation of joint mobility [HP:0001376],0/0,0%,4/4,100%,,
-Abnormality of joint mobility [HP:0011729],1/1,100%,5/5,100%,,
-Abnormal joint physiology [HP:0034430],1/1,100%,5/5,100%,,
-Abnormal musculoskeletal physiology [HP:0011843],1/1,100%,5/5,100%,,
-Abnormality of cardiovascular system electrophysiology [HP:0030956],15/15,100%,3/3,100%,,
-Abnormal cardiovascular system physiology [HP:0011025],23/23,100%,5/5,100%,,
-Pre-capillary pulmonary hypertension [HP:0033578],4/4,100%,0/0,0%,,
-Elevated pulmonary artery pressure [HP:0004890],4/4,100%,0/0,0%,,
-Abnormality of pulmonary circulation [HP:0030875],4/4,100%,0/0,0%,,
-Abnormal vascular physiology [HP:0030163],4/4,100%,0/0,0%,,
-Abnormality of the vasculature [HP:0002597],10/10,100%,2/2,100%,,
-Abnormal respiratory system physiology [HP:0002795],4/4,100%,0/0,0%,,
-Abnormality of the respiratory system [HP:0002086],4/4,100%,0/0,0%,,
-Tricuspid regurgitation [HP:0005180],3/3,100%,0/0,0%,,
-Atrioventricular valve regurgitation [HP:0034376],4/4,100%,2/2,100%,,
-Abnormal atrioventricular valve physiology [HP:0031650],4/4,100%,2/2,100%,,
-Abnormal heart valve physiology [HP:0031653],4/4,100%,2/2,100%,,
-Abnormal tricuspid valve physiology [HP:0031651],3/3,100%,0/0,0%,,
-Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,0/0,0%,,
-Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,0/0,0%,,
-Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,0/0,0%,,
-Abnormal finger phalanx morphology [HP:0005918],3/3,100%,0/0,0%,,
-Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,0/0,0%,,
-Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,0/0,0%,,
-Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,0/0,0%,,
-Aplasia involving bones of the extremities [HP:0009825],0/0,0%,0/0,0%,,
-Aplasia of metacarpal bones [HP:0010048],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,0/0,0%,,
-Abnormal metacarpal morphology [HP:0005916],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,0/0,0%,,
-Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,0/0,0%,,
-Partial absence of thumb [HP:0009659],0/0,0%,0/0,0%,,
-1-2 finger syndactyly [HP:0010704],3/3,100%,1/1,100%,,
-Finger syndactyly [HP:0006101],4/4,100%,2/2,100%,,
-Syndactyly [HP:0001159],4/4,100%,2/2,100%,,
-Upper limb phocomelia [HP:0009813],8/85,9%,2/37,5%,,
-Phocomelia [HP:0009829],8/8,100%,2/2,100%,,
-Clinodactyly of the 5th finger [HP:0004209],1/1,100%,0/0,0%,,
-Finger clinodactyly [HP:0040019],1/1,100%,0/0,0%,,
-Clinodactyly [HP:0030084],1/1,100%,0/0,0%,,
-Deviation of finger [HP:0004097],1/1,100%,2/2,100%,,
-Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,2/2,100%,,
-Deviation of the 5th finger [HP:0009179],1/1,100%,0/0,0%,,
-Abnormal 5th finger morphology [HP:0004207],4/4,100%,0/0,0%,,
-Patent foramen ovale [HP:0001655],4/40,10%,0/36,0%,,
-Small hypothenar eminence [HP:0010487],2/2,100%,0/0,0%,,
-Abnormality of the hypothenar eminence [HP:0010486],2/2,100%,0/0,0%,,
-Abnormality of the musculature of the hand [HP:0001421],2/2,100%,0/0,0%,,
-Abnormality of the musculature of the upper limbs [HP:0001446],2/2,100%,0/0,0%,,
-Abnormality of the musculature of the limbs [HP:0009127],2/2,100%,0/0,0%,,
-Abnormal skeletal muscle morphology [HP:0011805],2/2,100%,0/0,0%,,
-Abnormality of the musculature [HP:0003011],2/2,100%,0/0,0%,,
-Perimembranous ventricular septal defect [HP:0011682],3/59,5%,3/25,12%,,
-Deviation of the thumb [HP:0009603],0/0,0%,2/2,100%,,
-Aplasia/hypoplasia of the humerus [HP:0006507],7/7,100%,4/4,100%,,
-Abnormality of the humerus [HP:0003063],7/7,100%,4/4,100%,,
-Abnormality of the upper arm [HP:0001454],7/7,100%,4/4,100%,,
-Right atrial enlargement [HP:0030718],4/4,100%,0/0,0%,,
-Abnormal right atrium morphology [HP:0025580],4/4,100%,0/0,0%,,
-Atrial septal dilatation [HP:0011995],4/4,100%,0/0,0%,,
-Pectus excavatum [HP:0000767],3/4,75%,2/2,100%,,
-Abnormal sternum morphology [HP:0000766],3/3,100%,2/2,100%,,
-Postaxial hand polydactyly [HP:0001162],3/4,75%,0/0,0%,,
-Postaxial polydactyly [HP:0100259],3/3,100%,0/0,0%,,
-Polydactyly [HP:0010442],3/3,100%,0/0,0%,,
-Hand polydactyly [HP:0001161],3/3,100%,0/0,0%,,
-Duplication of phalanx of hand [HP:0009997],3/3,100%,0/0,0%,,
-Duplication of hand bones [HP:0004275],3/3,100%,0/0,0%,,
-Duplication of bones involving the upper extremities [HP:0009142],3/3,100%,0/0,0%,,
-High palate [HP:0000218],3/3,100%,0/0,0%,,
-Abnormal palate morphology [HP:0000174],5/5,100%,0/0,0%,,
-Abnormal oral cavity morphology [HP:0000163],5/5,100%,1/1,100%,,
-Abnormal oral morphology [HP:0031816],5/5,100%,1/1,100%,,
-Abnormality of the mouth [HP:0000153],5/5,100%,1/1,100%,,
-Abnormality of the face [HP:0000271],5/5,100%,1/1,100%,,
-Abnormality of the head [HP:0000234],5/5,100%,2/2,100%,,
-Abnormality of head or neck [HP:0000152],5/5,100%,2/2,100%,,
-Short neck [HP:0000470],3/3,100%,0/0,0%,,
-Abnormal neck morphology [HP:0025668],3/3,100%,0/0,0%,,
-Abnormality of the neck [HP:0000464],3/3,100%,0/0,0%,,
-Abnormality of the cervical spine [HP:0003319],3/3,100%,0/0,0%,,
-Abnormality of the vertebral column [HP:0000925],4/4,100%,1/1,100%,,
-Shield chest [HP:0000914],3/3,100%,0/0,0%,,
-Enlarged thorax [HP:0100625],3/3,100%,0/0,0%,,
-Y-shaped metatarsals [HP:0010567],3/3,100%,0/0,0%,,
-Abnormal metatarsal morphology [HP:0001832],3/3,100%,0/0,0%,,
-Abnormal lower limb bone morphology [HP:0040069],3/3,100%,0/0,0%,,
-Abnormality of the lower limb [HP:0002814],3/3,100%,0/0,0%,,
-Abnormal foot morphology [HP:0001760],3/3,100%,0/0,0%,,
-Mitral regurgitation [HP:0001653],1/1,100%,2/2,100%,,
-Abnormal mitral valve physiology [HP:0031481],1/1,100%,2/2,100%,,
-First degree atrioventricular block [HP:0011705],0/22,0%,1/1,100%,,
-Congenital malformation of the great arteries [HP:0011603],4/4,100%,2/2,100%,,
-Abnormal morphology of the great vessels [HP:0030962],6/6,100%,2/2,100%,,
-Abnormal blood vessel morphology [HP:0033353],6/6,100%,2/2,100%,,
-Abnormal vascular morphology [HP:0025015],6/6,100%,2/2,100%,,
-Abnormal skull morphology [HP:0000929],1/1,100%,2/2,100%,,
-Sinus bradycardia [HP:0001688],0/0,0%,1/1,100%,,
-Abnormal electrophysiology of sinoatrial node origin [HP:0011702],0/0,0%,1/1,100%,,
-Arrhythmia [HP:0011675],1/1,100%,1/1,100%,,
-Bradycardia [HP:0001662],0/0,0%,1/1,100%,,
-Abnormal skin morphology [HP:0011121],0/0,0%,1/1,100%,,
-Abnormality of the skin [HP:0000951],0/0,0%,1/1,100%,,
-Abnormality of the integument [HP:0001574],0/0,0%,1/1,100%,,
-Sinus venosus atrial septal defect [HP:0011567],0/2,0%,1/1,100%,,
-Abnormal morphology of ulna [HP:0040071],2/2,100%,4/4,100%,,
-Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,2/2,100%,,
-Persistent left superior vena cava [HP:0005301],2/37,5%,0/0,0%,,
-Abnormal superior vena cava morphology [HP:0025575],2/2,100%,0/0,0%,,
-Abnormal vena cava morphology [HP:0005345],2/2,100%,0/0,0%,,
-Abnormal venous morphology [HP:0002624],2/2,100%,0/0,0%,,
-Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,,
-Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,0/0,0%,,
-Shoulder muscle hypoplasia [HP:0008952],0/0,0%,0/0,0%,,
-Hypoplasia of the musculature [HP:0009004],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,0/0,0%,,
-Abnormality of muscle size [HP:0030236],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,0/0,0%,,
-Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,0/0,0%,,
-Common atrium [HP:0011565],0/83,0%,0/38,0%,,
-Unroofed coronary sinus [HP:0031297],0/85,0%,0/38,0%,,
-Atrioventricular dissociation [HP:0011709],0/22,0%,1/1,100%,,
-Bowed forearm bones [HP:0003956],0/0,0%,1/1,100%,,
-Bowing of the arm [HP:0006488],0/0,0%,1/1,100%,,
-Bowing of the long bones [HP:0006487],0/0,0%,1/1,100%,,
-Abnormal diaphysis morphology [HP:0000940],0/0,0%,1/1,100%,,
-Proximal placement of thumb [HP:0009623],0/0,0%,0/0,0%,,
-Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,0/0,0%,,
-Type A brachydactyly [HP:0009370],0/0,0%,0/0,0%,,
-Brachydactyly [HP:0001156],0/0,0%,0/0,0%,,
-Short 5th finger [HP:0009237],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,0/0,0%,,
-Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,0/0,0%,,
-Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,0/0,0%,,
-Short middle phalanx of finger [HP:0005819],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,0/0,0%,,
-Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,0/0,0%,,
-Short phalanx of finger [HP:0009803],0/0,0%,0/0,0%,,
-Micrognathia [HP:0000347],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,,
-Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,,
-Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,,
-Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,,
-Cleft soft palate [HP:0000185],2/2,100%,0/0,0%,,
-Abnormal soft palate morphology [HP:0100736],2/2,100%,0/0,0%,,
-Cleft palate [HP:0000175],2/2,100%,0/0,0%,,
-Orofacial cleft [HP:0000202],2/2,100%,0/0,0%,,
-Craniofacial cleft [HP:5201015],2/2,100%,0/0,0%,,
-Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,,
-Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,,
-Amelia involving the upper limbs [HP:0009812],0/83,0%,1/37,3%,,
-Third degree atrioventricular block [HP:0001709],0/22,0%,1/1,100%,,
-Upper extremity joint dislocation [HP:0030310],0/0,0%,2/2,100%,,
-Joint dislocation [HP:0001373],0/0,0%,2/2,100%,,
-Abnormal toe morphology [HP:0001780],0/0,0%,0/0,0%,,
-Abnormal toe phalanx morphology [HP:0010161],0/0,0%,0/0,0%,,
-Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the 2nd finger [HP:0006264],1/1,100%,0/0,0%,,
-Abnormal 2nd finger morphology [HP:0004100],1/1,100%,0/0,0%,,
-Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,,
-Left ventricular noncompaction cardiomyopathy [HP:0011664],0/1,0%,1/5,20%,,
-Noncompaction cardiomyopathy [HP:0012817],0/0,0%,1/1,100%,,
-Cardiomyopathy [HP:0001638],0/0,0%,1/1,100%,,
-Abnormal myocardium morphology [HP:0001637],0/0,0%,1/1,100%,,
-Mitral valve prolapse [HP:0001634],0/0,0%,1/1,100%,,
-Abnormal mitral valve morphology [HP:0001633],0/0,0%,1/1,100%,,
-Abnormal atrioventricular valve morphology [HP:0006705],0/0,0%,1/1,100%,,
-Abnormal heart valve morphology [HP:0001654],0/0,0%,1/1,100%,,
-Short 1st metacarpal [HP:0010034],0/30,0%,0/22,0%,,
-Short phalanx of the thumb [HP:0009660],0/30,0%,0/22,0%,,
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 10ce6a23a..8d1d47268 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -266,11 +266,11 @@ by exploring the phenotype MTC filtering report.
 .. raw:: html
   :file: report/tbx5_frameshift_vs_missense.mtc_report.html
 
-and these are the HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:
+and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:
 
 >>> from gpsea.analysis.predicate import PatientCategories
 >>> summary_df = result.summarize(hpo, PatientCategories.YES)
->>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv')  # doctest: +SKIP
+>>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv')  # doctest: +SKIP
 
 .. csv-table:: *TBX5* frameshift vs missense
    :file: report/tbx5_frameshift_vs_missense.csv
@@ -283,4 +283,4 @@ was observed in 31/60 (52%) patients with a missense variant
 but it was observed in 19/19 (100%) patients with a frameshift variant.
 Fisher exact test computed a p value of `~0.0000562`
 and the p value corrected by Benjamini-Hochberg procedure
-is `~0.00112`.
+is `~0.000899`.
diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst
index 00647f1f2..908afa147 100644
--- a/docs/user-guide/predicates.rst
+++ b/docs/user-guide/predicates.rst
@@ -4,23 +4,80 @@
 Predicates
 ==========
 
-GPSEA uses predicates to test if variant, patient, or any tested item 
-meets a condition. Based on the test results, the items are assigned into groups.
+Searching for genotype-phenotype associations usually requires to partition
+the individuals into several discrete groups to allow testing for the inter-group differences.
+GPSEA reflects these requirements with its predicate API.
+Perhaps unsurprisingly, a predicate must be capable of partitioning the individuals into two or more groups.
+The groups must be *exclusive* - each individual must be assigned at most into one group.
+Moreover, the groups should be *exhaustive* and cover maximum of the possible states.
+However, the predicate is allowed to return `None` if the individual cannot be assigned.
+In result, the individual will be omitted from the downstream analysis.
+
+Predicates serve both *genotype* and *phenotype* prongs of the analysis.
+Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`)
+assign the :class:`~gpsea.model.Patient`
+into a group (mostly) based on the variant information, while the
+phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`)
+use the HPO terms to assign a group.
+
+It is literally impossible to use GPSEA without the predicates
+because all analyses need at least one predicate (typically a *genotype* predicate).
+Luckily, the purpose of this guide is to show all that is to know about predicates.
+We will first discuss the genotype predicates and end with phenotype predicates.
+
+.. _genotype-predicates:
+
+*******************
+Genotype predicates
+*******************
+
+A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes.
+Typically, this includes using the genotype data, such as presence of a missense variant
+in a heterozygous genotype. However, other categorical variables,
+such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used.
+
+The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria.
+The testing is done in two steps. First, we count the alleles
+of the matching variants and then we interpret the count, possibly including factors
+such as the expected mode of inheritance and sex, to assign the individual into a group.
+Finding the matching variants is what
+the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about.
+
+
+TODO: wordsmith
+We must first create the variant predicate and then wrap it in genotype predicate.
 
-As described in the :class:`~gpsea.analysis.predicate.PolyPredicate` API, 
-the groups must be *exclusive* - the item can be assigned with one and only one group,
-and *exhaustive* - the groups must cover all possible scenarios.
 
-However, if the item cannot be assigned into any meaningful category, 
-the predicate can return `None`, and the item will be omitted from the analysis.
+Variant predicates
+==================
 
-The predicates can be chained to test for more complex conditions. 
-For instance, "test if a patient has a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`".
+GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class
+to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria.
+The variant predicate can leverage multiple primary data:
 
-Let's demonstrate this on an example with a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`.
-We will load a cohort of 5 subjects with variants in *ANKRD11*, leading to KBG syndrome. 
-The the clinical signs and symptoms of the subjects were encoded into HPO terms 
-along with the pathogenic *ANKRD11* variant.
++------------------------+-------------------------------------------------------------------------------------------------+
+| Primary data source    |   Example                                                                                       |
++========================+=================================================================================================+
+| Allele                 | the variant being a deletion or a single nucleotide variant (SNV)                               |
++------------------------+-------------------------------------------------------------------------------------------------+
+| Genome                 | overlaps of a target genomic region                                                             |
++------------------------+-------------------------------------------------------------------------------------------------+
+| Functional annotation  | variant is predicted to lead to a missense change or affect an exon of certain transcript       |
++------------------------+-------------------------------------------------------------------------------------------------+
+| Protein data           | variant is located in a region encoding a protein domain, protein feature type                  |
++------------------------+-------------------------------------------------------------------------------------------------+
+
+which demands a considerable amount of flexibility for creating the predicate.
+
+As a rule of thumb, the predicates for testing basic conditions are available off the shelf,
+and they can be used as building block for testing for more complex conditions,
+such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`".
+
+Let's demonstrate this on few examples.
+We will load a cohort of 19 subjects with variants in *RERE*,
+leading to `Holt-Oram syndrome MIM:142900 <https://omim.org/entry/142900>`_.
+The the clinical signs and symptoms of the subjects were encoded into HPO terms
+along with the pathogenic *RERE* variant.
 
 Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section.
 Briefly, we first load HPO:
@@ -36,7 +93,7 @@ then, we configure the cohort creator:
 
 which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets
 from the release `0.1.18` of `Phenopacket Store <https://github.com/monarch-initiative/phenopacket-store>`_.
-This time, however, we will load 19 individuals with mutations in *RERE* gene:
+We will load 19 individuals with mutations in *RERE* gene:
 
 >>> from ppktstore.registry import configure_phenopacket_registry
 >>> registry = configure_phenopacket_registry()
@@ -58,8 +115,8 @@ that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with
 >>> variant_key_of_interest = '1_8358231_8358231_T_C'
 >>> variant = cohort.get_variant_by_key(variant_key_of_interest)
 
-Simple predicates
-*****************
+Building blocks
+---------------
 
 We can check that the variant overlaps with *RERE*:
 
@@ -92,10 +149,10 @@ See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates`
 for more info on the predicates available off the shelf.
 
 
-Compound predicates
-*******************
+Complex conditions
+------------------
 
-The simple predicates can be combined to test for more elaborate conditions.
+We can combine the building blocks to test for more elaborate conditions.
 For instance, we can test if the variant meets *any* or several conditions:
 
 >>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id)
@@ -121,8 +178,8 @@ such as testing if the variant is a *"chromosomal deletion" or a deletion which
 '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))'
 
 
-Inverted predicate
-******************
+Inverting conditions
+--------------------
 
 Sometimes we may want to test the variant for a condition that must *not* be met.
 For instance, we may want to test if the variant is a deletion 
@@ -151,9 +208,219 @@ This is how we can use the predicate inversion to build the predicate for non-fr
 
 Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question.
 
+The variant predicate offers a flexible API for testing if variants meet a condition.
+However, the genotype phenotype correlations are done on the individual level
+and the variant predicates are used as a component of the genotype predicate.
+The next sections show how to use variant predicates to assign individuals into groups.
+
+
+Mode of inheritance predicate
+=============================
+
+The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate`
+assigns the individual into a group based on the number of alleles
+that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`.
+The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports
+the following Mendelian modes of inheritance (MoI):
+
+
++-----------------------+-----------------------------------+------------------+------------------------+
+|  Mode of inheritance  | Sex                               |   Allele count   |  Genotype category     |
++=======================+===================================+==================+========================+
+|  Autosomal dominant   | `*`                               |   0              |  `HOM_REF`             |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   1              |  `HET`                 |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   :math:`\ge 2`  |  ``None``              |
++-----------------------+-----------------------------------+------------------+------------------------+
+|  Autosomal recessive  | `*`                               |   0              |  `HOM_REF`             |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   1              |  `HET`                 |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   2              |  `BIALLELIC_ALT`       |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   :math:`\ge 3`  |  ``None``              |
++-----------------------+-----------------------------------+------------------+------------------------+
+|  X-linked dominant    | `*`                               |   0              |  `HOM_REF`             |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   1              |  `HET`                 |
++                       +-----------------------------------+------------------+------------------------+
+|                       | `*`                               |   :math:`\ge 2`  |  ``None``              |
++-----------------------+-----------------------------------+------------------+------------------------+
+|  X-linked recessive   | `*`                               |   0              |  `HOM_REF`             |
++                       +-----------------------------------+------------------+------------------------+
+|                       | :class:`~gpsea.model.Sex.FEMALE`  |   1              |  `HET`                 |
++                       +                                   +------------------+------------------------+
+|                       |                                   |   2              |  `BIALLELIC_ALT`       |
++                       +                                   +------------------+------------------------+
+|                       |                                   |   :math:`\ge 3`  |  ``None``              |
++                       +-----------------------------------+------------------+------------------------+
+|                       | :class:`~gpsea.model.Sex.MALE`    |   1              |  `HEMI`                |
++                       +                                   +------------------+------------------------+
+|                       |                                   |   :math:`\ge 2`  |  ``None``              |
++-----------------------+-----------------------------------+------------------+------------------------+
+
+.. note::
+
+    `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes.
+
+Clinical judgment should be used to choose the MoI for the cohort analysis.
+Then a predicate for the desired MoI can be created by one of 
+:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors:
+
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant`
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant`
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive`
+
+All constructors take an instance
+of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument.
+
+
+Example
+-------
+
+Here we show seting up a predicate for grouping individuals based on
+having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5``
+to test differences between the genotypes of a disease with an autosomal recessive MoI.
+
+First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+for testing if a variant meets the condition:
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates
+>>> tx_id = 'NM_1234.5'
+>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \
+...     | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id)
+>>> is_frameshift_or_stop_gain.get_question()
+'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)'
+
+Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+for assigning a patient into a genotype group:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
+>>> gt_predicate.get_question()
+'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'
+
+The `gt_predicate` can be used in downstream analysis, such as in :class:
+
+
+Groups predicate
+================
+
+Sometimes, all we want is to compare if there is a difference between individuals
+who include one or more alleles of variant $X$ vs. individuals with variants $Y$,
+vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates.
+We can do this with a *groups* predicate.
+
+The :func:`~gpsea.analysis.predicate.genotype.groups_predicate`
+takes *n* variant predicates and *n* group labels, and it will assign the patients
+into the respective groups if one or more matching allele is found.
+However, only one predicate is allowed to return a non-zero allele count.
+Otherwise, the patient is assigned with ``None`` and excluded from the analysis.
+
+Example
+-------
+
+Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+for testing if the individual has at least one missense vs. frameshift vs. synonymous variant.
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate
+>>> tx_id = 'NM_1234.5'
+>>> gt_predicate = groups_predicate(
+...     predicates=(
+...         VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id),
+...         VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id),
+...         VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id),
+...     ),
+...     group_names=('Missense', 'Frameshift', 'Synonymous'),
+... )
+>>> gt_predicate.get_question()
+'Genotype group: Missense, Frameshift, Synonymous'
+
+
+.. _phenotype-predicates:
+
+********************
+Phenotype predicates
+********************
+
+The phenotype predicate assigns the individual into a group with respect to tested phenotype.
+Typically, the phenotype corresponds to a clinical sign or symptom encoded into an HPO term.
+
+
+Propagating phenotype predicate
+===============================
+
+When testing for presence or absence of an HPO term, the propagating phenotype predicate
+leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy.
+In result, an individual annotated with a term is implicitly annotated with all its ancestors.
+For instance, an individual annotated with `Ectopia lentis <https://hpo.jax.org/browse/term/HP:0001083>`_
+is also annotated with `Abnormal lens morphology <https://hpo.jax.org/browse/term/HP:0000517>`_,
+`Abnormal anterior eye segment morphology <https://hpo.jax.org/browse/term/HP:0004328>`_,
+`Abnormal eye morphology <https://hpo.jax.org/browse/term/HP:0012372>`_, ...
+
+Similarly, all descendants of a term, whose presence was specifically excluded in an individual,
+are implicitly excluded.
+
+:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic.
+
+Example
+-------
+
+Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate`
+to test for a presence of `Abnormal lens morphology <https://hpo.jax.org/browse/term/HP:0000517>`_.
+
+
+>>> from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate
+>>> query = hpotk.TermId.from_curie('HP:0000517')
+>>> pheno_predicate = PropagatingPhenotypePredicate(
+...     hpo=hpo,
+...     query=query,
+... )
+>>> pheno_predicate.get_question()
+'Is Abnormal lens morphology present in the patient?'
+
+
+TODO: explain ``missing_implies_phenotype_excluded``
+
+
+Predicates for all cohort phenotypes
+====================================
+
+Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious.
+The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest`
+function cuts down the tedium:
+
+>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
+>>> pheno_predicates = prepare_predicates_for_terms_of_interest(
+...     cohort=cohort,
+...     hpo=hpo,
+... )
+>>> len(pheno_predicates)
+301
+
+and prepares predicates for testing 301 HPO terms of the *RERE* cohort.
+
+
+*******
+Gallery
+*******
+
+Here we show examples of predicates used in some of our analyses.
+
+TODO
+
+
+**********
+Need more?
+**********
 
-That's it for predicates. Please see :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` 
+Please see :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` 
 and :class:`~gpsea.analysis.predicate.genotype.ProteinPredicates` 
-for a comprehensive list of the predicates available off the shelf.
+for a list of the predicates available off the shelf.
 
-Please open an issue on our `GitHub tracker <https://github.com/monarch-initiative/gpsea/issues>`_ if a predicate seems to be missing.
+However, feel free to open an issue on our `GitHub tracker <https://github.com/monarch-initiative/gpsea/issues>`_
+if a predicate seems to be missing.
diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv
index 6abe64be2..a96700cfb 100644
--- a/docs/user-guide/report/tbx5_frameshift.csv
+++ b/docs/user-guide/report/tbx5_frameshift.csv
@@ -1,262 +1,22 @@
-FRAMESHIFT_VARIANT on NM_181486.4,Yes,Yes,No,No,,
+"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,,
 ,Count,Percent,Count,Percent,Corrected p values,p values
-Ventricular septal defect [HP:0001629],19/19,100%,42/71,59%,0.004112753923262261,0.00024192670136836828
-Abnormal atrioventricular conduction [HP:0005150],3/3,100%,1/23,4%,0.013076923076923076,0.0015384615384615385
-Absent thumb [HP:0009777],14/31,45%,18/100,18%,0.021044138590779585,0.0037136715160199273
-Atrioventricular block [HP:0001678],2/2,100%,1/23,4%,0.034,0.010000000000000002
-Heart block [HP:0012722],2/2,100%,1/23,4%,0.034,0.010000000000000002
-Patent ductus arteriosus [HP:0001643],2/2,100%,6/40,15%,0.09214092140921408,0.03252032520325203
-Secundum atrial septal defect [HP:0001684],4/22,18%,23/55,42%,0.1440020479198931,0.06544319142266644
-Triphalangeal thumb [HP:0001199],13/32,41%,23/99,23%,0.1440020479198931,0.06932119159387057
-Cardiac conduction abnormality [HP:0031546],3/3,100%,15/37,41%,0.1440020479198931,0.08259109311740892
-Muscular ventricular septal defect [HP:0011623],6/25,24%,8/84,10%,0.1440020479198931,0.08470708701170182
-Pulmonary arterial hypertension [HP:0002092],0/2,0%,8/14,57%,0.6899307928951144,0.4666666666666667
-Short thumb [HP:0009778],8/30,27%,25/69,36%,0.6899307928951144,0.4870099714553749
-Absent radius [HP:0003974],6/25,24%,9/43,21%,1.0,0.7703831604944444
-Hypoplasia of the radius [HP:0002984],6/14,43%,34/75,45%,1.0,1.0
-Atrial septal defect [HP:0001631],20/20,100%,63/65,97%,1.0,1.0
-Hypoplasia of the ulna [HP:0003022],2/10,20%,3/17,18%,1.0,1.0
-Short humerus [HP:0005792],4/9,44%,8/21,38%,1.0,1.0
-Abnormal ventricular septum morphology [HP:0010438],19/19,100%,42/42,100%,,
-Abnormal cardiac ventricle morphology [HP:0001713],19/19,100%,43/43,100%,,
-Abnormal heart morphology [HP:0001627],30/30,100%,89/89,100%,,
-Abnormal cardiovascular system morphology [HP:0030680],30/30,100%,92/92,100%,,
-Abnormality of the cardiovascular system [HP:0001626],32/32,100%,94/94,100%,,
-Phenotypic abnormality [HP:0000118],38/38,100%,114/114,100%,,
-All [HP:0000001],38/38,100%,114/114,100%,,
-Abnormal cardiac septum morphology [HP:0001671],28/28,100%,89/89,100%,,
-Forearm undergrowth [HP:0009821],7/7,100%,35/35,100%,,
-Abnormal upper limb bone morphology [HP:0040070],14/14,100%,50/50,100%,,
-Abnormality of the upper limb [HP:0002817],34/34,100%,102/102,100%,,
-Abnormality of limbs [HP:0040064],34/34,100%,102/102,100%,,
-Abnormal limb bone morphology [HP:0002813],34/34,100%,92/92,100%,,
-Abnormality of limb bone [HP:0040068],34/34,100%,92/92,100%,,
-Abnormality of the skeletal system [HP:0000924],35/35,100%,103/103,100%,,
-Abnormality of the musculoskeletal system [HP:0033127],35/35,100%,104/104,100%,,
-Abnormal appendicular skeleton morphology [HP:0011844],34/34,100%,93/93,100%,,
-Abnormal skeletal morphology [HP:0011842],35/35,100%,103/103,100%,,
-Upper limb undergrowth [HP:0009824],7/7,100%,38/38,100%,,
-Limb undergrowth [HP:0009826],7/7,100%,38/38,100%,,
-Aplasia/hypoplasia of the extremities [HP:0009815],22/22,100%,78/78,100%,,
-Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23,100%,80/80,100%,,
-Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22,100%,78/78,100%,,
-Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22,100%,78/78,100%,,
-Aplasia/hypoplasia involving forearm bones [HP:0006503],12/12,100%,43/43,100%,,
-Abnormal forearm bone morphology [HP:0040072],14/14,100%,43/43,100%,,
-Abnormal forearm morphology [HP:0002973],14/14,100%,43/43,100%,,
-Short long bone [HP:0003026],9/9,100%,41/41,100%,,
-Abnormal long bone morphology [HP:0011314],13/13,100%,50/50,100%,,
-Aplasia/Hypoplasia of the radius [HP:0006501],11/11,100%,43/43,100%,,
-Abnormal morphology of the radius [HP:0002818],13/13,100%,43/43,100%,,
-1-2 finger syndactyly [HP:0010704],1/1,100%,4/4,100%,,
-Finger syndactyly [HP:0006101],2/2,100%,5/5,100%,,
-Syndactyly [HP:0001159],2/2,100%,5/5,100%,,
-Abnormal digit morphology [HP:0011297],33/33,100%,67/67,100%,,
-Abnormal atrial septum morphology [HP:0011994],20/20,100%,64/64,100%,,
-Abnormal cardiac atrium morphology [HP:0005120],20/20,100%,64/64,100%,,
-Perimembranous ventricular septal defect [HP:0011682],3/25,12%,6/84,7%,,
-Aplasia/Hypoplasia of the thumb [HP:0009601],19/19,100%,40/40,100%,,
-Aplasia/Hypoplasia of fingers [HP:0006265],19/19,100%,44/44,100%,,
-Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19,100%,44/44,100%,,
-Abnormal hand morphology [HP:0005922],20/20,100%,75/75,100%,,
-Abnormality of the hand [HP:0001155],31/31,100%,88/88,100%,,
-Abnormal finger morphology [HP:0001167],31/31,100%,64/64,100%,,
-Abnormal thumb morphology [HP:0001172],31/31,100%,58/58,100%,,
-Finger aplasia [HP:0009380],14/14,100%,23/23,100%,,
-Aplasia involving forearm bones [HP:0009822],6/6,100%,9/9,100%,,
-Absent forearm bone [HP:0003953],6/6,100%,9/9,100%,,
-Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,4/4,100%,,
-Abnormal morphology of ulna [HP:0040071],4/4,100%,4/4,100%,,
-Abnormal carpal morphology [HP:0001191],0/0,0%,30/32,94%,,
-Abnormality of the wrist [HP:0003019],0/0,0%,30/30,100%,,
-Abnormality of upper limb joint [HP:0009810],6/6,100%,32/32,100%,,
-Abnormal joint morphology [HP:0001367],6/6,100%,33/33,100%,,
-Abnormality of cardiovascular system electrophysiology [HP:0030956],3/3,100%,18/18,100%,,
-Abnormal cardiovascular system physiology [HP:0011025],5/5,100%,30/30,100%,,
-Abnormality of thumb phalanx [HP:0009602],13/13,100%,26/26,100%,,
-Upper limb phocomelia [HP:0009813],2/37,5%,8/116,7%,,
-Phocomelia [HP:0009829],2/2,100%,8/8,100%,,
-Short finger [HP:0009381],8/8,100%,27/27,100%,,
-Short digit [HP:0011927],10/10,100%,28/28,100%,,
-Pre-capillary pulmonary hypertension [HP:0033578],0/0,0%,8/8,100%,,
-Elevated pulmonary artery pressure [HP:0004890],0/0,0%,8/8,100%,,
-Abnormality of pulmonary circulation [HP:0030875],0/0,0%,8/8,100%,,
-Abnormal vascular physiology [HP:0030163],0/0,0%,8/8,100%,,
-Abnormality of the vasculature [HP:0002597],2/2,100%,17/17,100%,,
-Abnormal respiratory system physiology [HP:0002795],0/0,0%,8/8,100%,,
-Abnormality of the respiratory system [HP:0002086],0/0,0%,8/8,100%,,
-Congenital malformation of the great arteries [HP:0011603],2/2,100%,7/7,100%,,
-Abnormal morphology of the great vessels [HP:0030962],2/2,100%,10/10,100%,,
-Abnormal blood vessel morphology [HP:0033353],2/2,100%,11/11,100%,,
-Abnormal vascular morphology [HP:0025015],2/2,100%,11/11,100%,,
-Patent foramen ovale [HP:0001655],0/36,0%,4/69,6%,,
-Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,,
-Abnormality of joint mobility [HP:0011729],5/5,100%,3/3,100%,,
-Abnormal joint physiology [HP:0034430],5/5,100%,3/3,100%,,
-Abnormal musculoskeletal physiology [HP:0011843],5/5,100%,3/3,100%,,
-Abnormality of the vertebral column [HP:0000925],1/1,100%,4/4,100%,,
-Abnormal axial skeleton morphology [HP:0009121],5/5,100%,9/9,100%,,
-Complete atrioventricular canal defect [HP:0001674],3/36,8%,6/67,9%,,
-Atrioventricular canal defect [HP:0006695],3/3,100%,6/6,100%,,
-Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,6/6,100%,,
-Shoulder muscle hypoplasia [HP:0008952],0/0,0%,6/6,100%,,
-Hypoplasia of the musculature [HP:0009004],0/0,0%,6/6,100%,,
-Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,6/6,100%,,
-Abnormality of muscle size [HP:0030236],0/0,0%,6/6,100%,,
-Abnormal skeletal muscle morphology [HP:0011805],0/0,0%,8/8,100%,,
-Abnormality of the musculature [HP:0003011],0/0,0%,8/8,100%,,
-Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,6/6,100%,,
-Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,6/6,100%,,
-Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,6/6,100%,,
-Abnormality of the musculature of the limbs [HP:0009127],0/0,0%,8/8,100%,,
-Abnormality of the musculature of the upper limbs [HP:0001446],0/0,0%,8/8,100%,,
-Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,6/6,100%,,
-Bowed forearm bones [HP:0003956],1/1,100%,0/0,0%,,
-Bowing of the arm [HP:0006488],1/1,100%,0/0,0%,,
-Bowing of the long bones [HP:0006487],1/1,100%,0/0,0%,,
-Abnormal diaphysis morphology [HP:0000940],1/1,100%,0/0,0%,,
-Micrognathia [HP:0000347],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,,
-Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,3/3,100%,,
-Abnormal skull morphology [HP:0000929],2/2,100%,1/1,100%,,
-Abnormality of the head [HP:0000234],2/2,100%,5/5,100%,,
-Abnormality of head or neck [HP:0000152],2/2,100%,5/5,100%,,
-Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,,
-Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,,
-Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,,
-Abnormal oral cavity morphology [HP:0000163],1/1,100%,5/5,100%,,
-Abnormal oral morphology [HP:0031816],1/1,100%,5/5,100%,,
-Abnormality of the mouth [HP:0000153],1/1,100%,5/5,100%,,
-Abnormality of the face [HP:0000271],1/1,100%,5/5,100%,,
-Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],2/2,100%,2/2,100%,,
-Abnormal thorax morphology [HP:0000765],5/5,100%,7/7,100%,,
-Persistent left superior vena cava [HP:0005301],0/0,0%,4/39,10%,,
-Abnormal superior vena cava morphology [HP:0025575],0/0,0%,4/4,100%,,
-Abnormal vena cava morphology [HP:0005345],0/0,0%,4/4,100%,,
-Abnormal venous morphology [HP:0002624],0/0,0%,4/4,100%,,
-Aplasia/hypoplasia of the humerus [HP:0006507],4/4,100%,8/8,100%,,
-Abnormality of the humerus [HP:0003063],4/4,100%,8/8,100%,,
-Abnormality of the upper arm [HP:0001454],4/4,100%,8/8,100%,,
-Right atrial enlargement [HP:0030718],0/0,0%,4/4,100%,,
-Abnormal right atrium morphology [HP:0025580],0/0,0%,4/4,100%,,
-Atrial septal dilatation [HP:0011995],0/0,0%,4/4,100%,,
-Left ventricular noncompaction cardiomyopathy [HP:0011664],1/5,20%,1/7,14%,,
-Noncompaction cardiomyopathy [HP:0012817],1/1,100%,1/1,100%,,
-Cardiomyopathy [HP:0001638],1/1,100%,1/1,100%,,
-Abnormal myocardium morphology [HP:0001637],1/1,100%,1/1,100%,,
-Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,,
-Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,,
-Abnormal toe morphology [HP:0001780],0/0,0%,1/1,100%,,
-Abnormal foot morphology [HP:0001760],0/0,0%,4/4,100%,,
-Abnormality of the lower limb [HP:0002814],0/0,0%,4/4,100%,,
-Abnormal toe phalanx morphology [HP:0010161],0/0,0%,1/1,100%,,
-Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,1/1,100%,,
-Abnormal lower limb bone morphology [HP:0040069],0/0,0%,4/4,100%,,
-Aplasia involving bones of the extremities [HP:0009825],0/0,0%,3/3,100%,,
-Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,3/3,100%,,
-Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,3/3,100%,,
-Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,3/3,100%,,
-Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,3/3,100%,,
-Abnormal finger phalanx morphology [HP:0005918],0/0,0%,9/9,100%,,
-Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,6/6,100%,,
-Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,3/3,100%,,
-Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,3/3,100%,,
-Aplasia of metacarpal bones [HP:0010048],0/0,0%,3/3,100%,,
-Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,4/4,100%,,
-Abnormal metacarpal morphology [HP:0005916],0/0,0%,4/4,100%,,
-Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,4/4,100%,,
-Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,4/4,100%,,
-Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,4/4,100%,,
-Partial absence of thumb [HP:0009659],0/0,0%,3/3,100%,,
-Limited pronation/supination of forearm [HP:0006394],3/3,100%,2/2,100%,,
-Limited elbow movement [HP:0002996],4/4,100%,2/2,100%,,
-Abnormality of the elbow [HP:0009811],5/5,100%,2/2,100%,,
-Limitation of joint mobility [HP:0001376],4/4,100%,2/2,100%,,
-Mitral regurgitation [HP:0001653],2/2,100%,2/2,100%,,
-Atrioventricular valve regurgitation [HP:0034376],2/2,100%,7/7,100%,,
-Abnormal atrioventricular valve physiology [HP:0031650],2/2,100%,7/7,100%,,
-Abnormal heart valve physiology [HP:0031653],2/2,100%,7/7,100%,,
-Abnormal mitral valve physiology [HP:0031481],2/2,100%,2/2,100%,,
-Tricuspid regurgitation [HP:0005180],0/0,0%,5/5,100%,,
-Abnormal tricuspid valve physiology [HP:0031651],0/0,0%,5/5,100%,,
-Cleft soft palate [HP:0000185],0/0,0%,2/2,100%,,
-Abnormal soft palate morphology [HP:0100736],0/0,0%,2/2,100%,,
-Abnormal palate morphology [HP:0000174],0/0,0%,5/5,100%,,
-Cleft palate [HP:0000175],0/0,0%,2/2,100%,,
-Orofacial cleft [HP:0000202],0/0,0%,2/2,100%,,
-Craniofacial cleft [HP:5201015],0/0,0%,2/2,100%,,
-Sinus bradycardia [HP:0001688],1/1,100%,2/2,100%,,
-Abnormal electrophysiology of sinoatrial node origin [HP:0011702],1/1,100%,2/2,100%,,
-Arrhythmia [HP:0011675],1/1,100%,3/3,100%,,
-Bradycardia [HP:0001662],1/1,100%,2/2,100%,,
-Pectus excavatum [HP:0000767],2/2,100%,3/4,75%,,
-Abnormal sternum morphology [HP:0000766],2/2,100%,3/3,100%,,
-Deviation of the thumb [HP:0009603],2/2,100%,3/3,100%,,
-Deviation of finger [HP:0004097],2/2,100%,4/4,100%,,
-Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,5/5,100%,,
-Abnormal skin morphology [HP:0011121],1/1,100%,0/0,0%,,
-Abnormality of the skin [HP:0000951],1/1,100%,0/0,0%,,
-Abnormality of the integument [HP:0001574],1/1,100%,0/0,0%,,
-Sinus venosus atrial septal defect [HP:0011567],1/1,100%,0/2,0%,,
-First degree atrioventricular block [HP:0011705],1/1,100%,1/23,4%,,
-Postaxial hand polydactyly [HP:0001162],0/0,0%,3/4,75%,,
-Postaxial polydactyly [HP:0100259],0/0,0%,3/3,100%,,
-Polydactyly [HP:0010442],0/0,0%,3/3,100%,,
-Abnormal 5th finger morphology [HP:0004207],0/0,0%,6/6,100%,,
-Hand polydactyly [HP:0001161],0/0,0%,3/3,100%,,
-Duplication of phalanx of hand [HP:0009997],0/0,0%,3/3,100%,,
-Duplication of hand bones [HP:0004275],0/0,0%,3/3,100%,,
-Duplication of bones involving the upper extremities [HP:0009142],0/0,0%,3/3,100%,,
-High palate [HP:0000218],0/0,0%,3/3,100%,,
-Short neck [HP:0000470],0/0,0%,3/3,100%,,
-Abnormal neck morphology [HP:0025668],0/0,0%,3/3,100%,,
-Abnormality of the neck [HP:0000464],0/0,0%,3/3,100%,,
-Abnormality of the cervical spine [HP:0003319],0/0,0%,3/3,100%,,
-Shield chest [HP:0000914],0/0,0%,3/3,100%,,
-Enlarged thorax [HP:0100625],0/0,0%,3/3,100%,,
-Abnormal rib cage morphology [HP:0001547],0/0,0%,5/5,100%,,
-Y-shaped metatarsals [HP:0010567],0/0,0%,3/3,100%,,
-Abnormal metatarsal morphology [HP:0001832],0/0,0%,3/3,100%,,
-11 pairs of ribs [HP:0000878],0/0,0%,2/2,100%,,
-Missing ribs [HP:0000921],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the ribs [HP:0006712],0/0,0%,2/2,100%,,
-Abnormal rib morphology [HP:0000772],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the 2nd finger [HP:0006264],0/0,0%,2/2,100%,,
-Abnormal 2nd finger morphology [HP:0004100],0/0,0%,2/2,100%,,
-Small hypothenar eminence [HP:0010487],0/0,0%,2/2,100%,,
-Abnormality of the hypothenar eminence [HP:0010486],0/0,0%,2/2,100%,,
-Abnormality of the musculature of the hand [HP:0001421],0/0,0%,2/2,100%,,
-Upper extremity joint dislocation [HP:0030310],2/2,100%,0/0,0%,,
-Joint dislocation [HP:0001373],2/2,100%,0/0,0%,,
-Amelia involving the upper limbs [HP:0009812],1/37,3%,0/114,0%,,
-Third degree atrioventricular block [HP:0001709],1/1,100%,0/22,0%,,
-Atrioventricular dissociation [HP:0011709],1/1,100%,0/22,0%,,
-Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,,
-Proximal placement of thumb [HP:0009623],0/0,0%,3/3,100%,,
-Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,2/2,100%,,
-Type A brachydactyly [HP:0009370],0/0,0%,2/2,100%,,
-Brachydactyly [HP:0001156],0/0,0%,2/2,100%,,
-Short 5th finger [HP:0009237],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,2/2,100%,,
-Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,2/2,100%,,
-Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,2/2,100%,,
-Short middle phalanx of finger [HP:0005819],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,2/2,100%,,
-Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,2/2,100%,,
-Short phalanx of finger [HP:0009803],0/0,0%,2/2,100%,,
-Common atrium [HP:0011565],0/38,0%,1/115,1%,,
-Unroofed coronary sinus [HP:0031297],0/38,0%,1/117,1%,,
-Clinodactyly of the 5th finger [HP:0004209],0/0,0%,2/2,100%,,
-Finger clinodactyly [HP:0040019],0/0,0%,2/2,100%,,
-Clinodactyly [HP:0030084],0/0,0%,2/2,100%,,
-Deviation of the 5th finger [HP:0009179],0/0,0%,2/2,100%,,
-Mitral valve prolapse [HP:0001634],1/1,100%,1/1,100%,,
-Abnormal mitral valve morphology [HP:0001633],1/1,100%,1/1,100%,,
-Abnormal atrioventricular valve morphology [HP:0006705],1/1,100%,1/1,100%,,
-Abnormal heart valve morphology [HP:0001654],1/1,100%,1/1,100%,,
-Short 1st metacarpal [HP:0010034],0/22,0%,1/45,2%,,
-Short phalanx of the thumb [HP:0009660],0/22,0%,1/45,2%,,
+Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825
+Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387
+Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.021044138590779585,0.003713671516019927
+Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.034,0.01
+Heart block [HP:0012722],1/23,4%,2/2,100%,0.034,0.01
+Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.09214092140921408,0.03252032520325203
+Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.1440020479198931,0.06544319142266644
+Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.1440020479198931,0.06932119159387057
+Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.1440020479198931,0.08259109311740892
+Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.1440020479198931,0.08470708701170182
+Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667
+Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483
+Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444
+Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0
+Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0
+Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0
+Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0
+Abnormal atrial septum morphology [HP:0011994],64/64,100%,20/20,100%,,
+Abnormal cardiac septum morphology [HP:0001671],89/89,100%,28/28,100%,,
+Abnormal heart morphology [HP:0001627],89/89,100%,30/30,100%,,
diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst
index 3eeca2bfc..96e9376e0 100644
--- a/docs/user-guide/stats.rst
+++ b/docs/user-guide/stats.rst
@@ -122,13 +122,34 @@ We want to separate the patients into two groups: a group *with* a frameshift va
 and a group *without* a frameshift variant, based on the functional annotation.
 We will use the *MANE* transcript for the analysis:
 
+Building a genotype predicate is a two step process. 
+First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+to test if the variant leads to a frameshift (in this case):
+
 >>> from gpsea.model import VariantEffect
 >>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate
 >>> tx_id = 'NM_181486.4'
->>> gt_predicate = boolean_predicate(VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id))
->>> gt_predicate.get_question()
+>>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id)
+>>> is_frameshift.get_question()
 'FRAMESHIFT_VARIANT on NM_181486.4'
 
+and then we choose the expected mode of inheritance to test. In case of *TBX5*,
+we expect the autosomal dominant mode of inheritance:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift)
+>>> gt_predicate.get_question()
+'Which genotype group does the patient fit in: HOM_REF, HET'
+
+`gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group
+and the patients with one frameshift allele will be assigned into `HET` group.
+Note, any patient with 2 or more alleles will be *omitted* from the analysis.
+
+.. note::
+
+   Mode of inheritance testing is not the only way to dissect by a genotype.
+   See the :ref:`genotype-predicates` section for more info.
+
 
 **Phenotype predicates**
 
@@ -229,18 +250,32 @@ We can learn more by showing the MTC filter report:
 Genotype phenotype associations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Last, let's explore the associations. This is a table of the tested HPO terms
-ordered by the corrected p value (Benjamini-Hochberg FDR):
+Last, let's explore the associations. The results include a table with all tested HPO terms
+ordered by the corrected p value (Benjamini-Hochberg FDR).
+Here we show the top 20 table rows:
 
 >>> from gpsea.analysis.predicate import PatientCategories
 >>> summary_df = result.summarize(hpo, PatientCategories.YES)
->>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv')  # doctest: +SKIP
+>>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv')  # doctest: +SKIP
 
 .. csv-table:: *TBX5* frameshift vs rest
    :file: report/tbx5_frameshift.csv
    :header-rows: 2
 
 
+The table shows that several HPO terms are significantly associated
+with presence of a heterozygous (`HET`) frameshift variant in *TBX5*.
+For example, `Ventricular septal defect <https://hpo.jax.org/browse/term/HP:0001629>`_
+was observed in 31/60 (52%) patients with a missense variant
+but it was observed in 19/19 (100%) patients with a frameshift variant.
+Fisher exact test computed a p value of `~0.000242`
+and the p value corrected by Benjamini-Hochberg procedure
+is `~0.00411`.
+
+The table includes all HPO terms of the cohort, including the terms that were not selected for testing
+and thus have no associated p value.
+
+
 .. _phenotype-score-stats:
 
 ***************
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 1cd1a6577..a032aae95 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -273,24 +273,42 @@ class MendelianInheritanceAspect(enum.Enum):
 
 class ModeOfInheritanceInfo:
 
+    # NOT PART OF THE PUBLIC API!!!
+
+    HOM_REF = Categorization(
+        PatientCategory(
+            cat_id=0, name="HOM_REF", description="Homozygous reference",
+        ),
+    )
+    HET = Categorization(
+        PatientCategory(
+            cat_id=1, name="HET", description="Heterozygous",
+        ),
+    )
+    BIALLELIC_ALT = Categorization(
+        PatientCategory(
+            cat_id=2, name="BIALLELIC_ALT",
+            description="Homozygous alternate or compound heterozygous",
+        ),
+    )
+    HEMI = Categorization(
+        PatientCategory(
+            cat_id=3, name="HEMI", description="Hemizygous",
+        ),
+    )
+
     @staticmethod
     def autosomal_dominant() -> "ModeOfInheritanceInfo":
         groups = (
             GenotypeGroup(
                 allele_count=0,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(
-                        cat_id=0, name="0/0", description="Homozygous reference"
-                    ),
-                ),
+                categorization=ModeOfInheritanceInfo.HOM_REF,
             ),
             GenotypeGroup(
                 allele_count=1,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
-                ),
+                categorization=ModeOfInheritanceInfo.HET,
             ),
         )
         return ModeOfInheritanceInfo(
@@ -304,25 +322,17 @@ def autosomal_recessive() -> "ModeOfInheritanceInfo":
             GenotypeGroup(
                 allele_count=0,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(
-                        cat_id=0, name="0/0", description="Homozygous reference"
-                    ),
-                ),
+                categorization=ModeOfInheritanceInfo.HOM_REF,
             ),
             GenotypeGroup(
                 allele_count=1,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
-                ),
+                categorization=ModeOfInheritanceInfo.HET,
             ),
             GenotypeGroup(
                 allele_count=2,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(cat_id=2, name="1/1", description="Homozygous alternate"),
-                ),
+                categorization=ModeOfInheritanceInfo.BIALLELIC_ALT,
             ),
         )
         return ModeOfInheritanceInfo(
@@ -336,25 +346,12 @@ def x_dominant() -> "ModeOfInheritanceInfo":
             GenotypeGroup(
                 allele_count=0,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(
-                        cat_id=0, name="0", description="Homozygous reference"
-                    ),
-                ),
-            ),
-            GenotypeGroup(
-                allele_count=1,
-                sex=Sex.FEMALE,
-                categorization=Categorization(
-                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
-                ),
+                categorization=ModeOfInheritanceInfo.HOM_REF,
             ),
             GenotypeGroup(
                 allele_count=1,
-                sex=Sex.MALE,
-                categorization=Categorization(
-                    PatientCategory(cat_id=2, name="1", description="Hemizygous"),
-                ),
+                sex=None,
+                categorization=ModeOfInheritanceInfo.HET,
             ),
         )
         return ModeOfInheritanceInfo(
@@ -368,25 +365,22 @@ def x_recessive() -> "ModeOfInheritanceInfo":
             GenotypeGroup(
                 allele_count=0,
                 sex=None,
-                categorization=Categorization(
-                    PatientCategory(
-                        cat_id=0, name="0/0", description="Homozygous reference"
-                    ),
-                ),
+                categorization=ModeOfInheritanceInfo.HOM_REF,
             ),
             GenotypeGroup(
                 allele_count=1,
                 sex=Sex.FEMALE,
-                categorization=Categorization(
-                    PatientCategory(cat_id=1, name="0/1", description="Heterozygous"),
-                ),
+                categorization=ModeOfInheritanceInfo.HET,
+            ),
+            GenotypeGroup(
+                allele_count=2,
+                sex=Sex.FEMALE,
+                categorization=ModeOfInheritanceInfo.BIALLELIC_ALT,
             ),
             GenotypeGroup(
                 allele_count=1,
                 sex=Sex.MALE,
-                categorization=Categorization(
-                    PatientCategory(cat_id=2, name="1", description="Hemizygous"),
-                ),
+                categorization=ModeOfInheritanceInfo.HEMI,
             ),
         )
 
@@ -400,7 +394,8 @@ def __init__(
         mendelian_inheritance_aspect: MendelianInheritanceAspect,
         groups: typing.Iterable[GenotypeGroup],
     ):
-        # We pre-compute the hash manually.
+        # We want this to be hashable but also keep a non-hashable dict
+        # as a field. Therefore, we pre-compute the hash manually.
         # The correctness depends on two default dicts with same keys and values
         # comparing equal.
         hash_value = 17
diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py
index 0d541a65d..09a1c913d 100644
--- a/src/gpsea/model/_base.py
+++ b/src/gpsea/model/_base.py
@@ -13,17 +13,17 @@ class Sex(enum.Enum):
     
     UNKNOWN_SEX = 0
     """
-    Not assessed or not available. Maps to `NCIT:C17998`.
+    Not assessed or not available. Maps to ``NCIT:C17998``.
     """
     
     FEMALE = 1
     """
-    Female sex. Maps to `NCIT:C46113`.
+    Female sex. Maps to ``NCIT:C46113``.
     """
 
     MALE = 2
     """
-    Male sex. Maps to `NCIT:C46112`.
+    Male sex. Maps to ``NCIT:C46112``.
     """
 
     def is_provided(self) -> bool:
diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py
index c44a214b8..7138e93ff 100644
--- a/tests/analysis/predicate/genotype/conftest.py
+++ b/tests/analysis/predicate/genotype/conftest.py
@@ -211,10 +211,10 @@ def patient_w_frameshift(
 
 
 """
-Genesis family - Autosomal dominant
+Genesis family - Autosomal dominant but can also be used as X dominant.
 
-* Adam - father, affected
-* Eve - mother, unaffected
+* Adam - father, unaffected
+* Eve - mother, affected
 * Cain - son, affected
 """
 
@@ -260,8 +260,8 @@ def genesis_mutation(
         ),
         genotypes=Genotypes.from_mapping(
             {
-                adam_label: Genotype.HETEROZYGOUS,
-                eve_label: Genotype.HOMOZYGOUS_REFERENCE,
+                adam_label: Genotype.HOMOZYGOUS_REFERENCE,
+                eve_label: Genotype.HETEROZYGOUS,
                 cain_label: Genotype.HETEROZYGOUS,
             }
         ),
@@ -597,8 +597,3 @@ def leia(
         diseases=(),
         variants=(skywalker_mutation,),
     )
-
-
-"""
-XR family
-"""
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index ac6977c8e..36904159f 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -84,17 +84,16 @@ def variant_predicate(self) -> VariantPredicate:
         return VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID)
 
     @pytest.mark.parametrize(
-        "patient_name,cat_id,name",
+        "patient_name,name",
         [
-            ("adam", 1, "0/1"),
-            ("eve", 0, "0/0"),
-            ("cain", 1, "0/1"),
+            ("adam", "HOM_REF"),
+            ("eve", "HET"),
+            ("cain", "HET"),
         ],
     )
     def test_autosomal_dominant(
         self,
         patient_name: str,
-        cat_id: int,
         name: str,
         variant_predicate: VariantPredicate,
         request: pytest.FixtureRequest,
@@ -106,22 +105,20 @@ def test_autosomal_dominant(
 
         assert categorization is not None
 
-        assert categorization.category.cat_id == cat_id
         assert categorization.category.name == name
 
     @pytest.mark.parametrize(
-        "patient_name,cat_id,name",
+        "patient_name,name",
         [
-            ("walt", 1, "0/1"),
-            ("skyler", 1, "0/1"),
-            ("flynn", 2, "1/1"),
-            ("holly", 0, "0/0"),
+            ("walt", "HET"),
+            ("skyler", "HET"),
+            ("flynn", "BIALLELIC_ALT"),
+            ("holly", "HOM_REF"),
         ],
     )
     def test_autosomal_recessive(
         self,
         patient_name: str,
-        cat_id: int,
         name: str,
         variant_predicate: VariantPredicate,
         request: pytest.FixtureRequest,
@@ -133,22 +130,44 @@ def test_autosomal_recessive(
 
         assert categorization is not None
 
-        assert categorization.category.cat_id == cat_id
         assert categorization.category.name == name
 
     @pytest.mark.parametrize(
-        "patient_name,cat_id,name",
+        "patient_name,name",
         [
-            ("anakin", 0, "0/0"),
-            ("padme", 1, "0/1"),
-            ("luke", 2, "1"),
-            ("leia", 1, "0/1"),
+            ("adam", "HOM_REF"),
+            ("eve", "HET"),
+            ("cain", "HET"),
+        ],
+    )
+    def test_x_dominant(
+        self,
+        patient_name: str,
+        name: str,
+        variant_predicate: VariantPredicate,
+        request: pytest.FixtureRequest,
+    ):
+        patient = request.getfixturevalue(patient_name)
+        predicate = ModeOfInheritancePredicate.x_dominant(variant_predicate)
+
+        categorization = predicate.test(patient)
+
+        assert categorization is not None
+
+        assert categorization.category.name == name
+
+    @pytest.mark.parametrize(
+        "patient_name,name",
+        [
+            ("anakin", "HOM_REF"),
+            ("padme", "HET"),
+            ("luke", "HEMI"),
+            ("leia", "HET"),
         ],
     )
     def test_x_recessive(
         self,
         patient_name: str,
-        cat_id: int,
         name: str,
         variant_predicate: VariantPredicate,
         request: pytest.FixtureRequest,
@@ -160,5 +179,4 @@ def test_x_recessive(
 
         assert categorization is not None
 
-        assert categorization.category.cat_id == cat_id
         assert categorization.category.name == name

From 7acfc72dbaace44ad12e1107b869e47294b4ca08 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Sun, 1 Sep 2024 23:24:19 +0200
Subject: [PATCH 08/16] Check compatibility between the predicates and the
 count statistic.

---
 docs/tutorial.rst                             |  4 +-
 docs/user-guide/mtc.rst                       |  4 +-
 docs/user-guide/stats.rst                     |  4 +-
 src/gpsea/analysis/pcats/_impl.py             | 74 ++++++++++++++-
 src/gpsea/analysis/pcats/stats/__init__.py    |  4 +-
 src/gpsea/analysis/pcats/stats/_stats.py      | 90 ++++++++++++-------
 .../analysis/pcats/stats/_test__stats.py      |  9 +-
 tests/analysis/pcats/test_disease.py          |  4 +-
 .../analysis/pcats/test_hpo_term_analysis.py  |  4 +-
 tests/test_tutorial.py                        |  4 +-
 10 files changed, 147 insertions(+), 54 deletions(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 8d1d47268..e047943ce 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -224,8 +224,8 @@ with a false discovery control level at (``mtc_alpha=0.05``):
 Choosing the statistical procedure for assessment of association between genotype and phenotype
 groups is the last missing piece of the analysis. We will use Fisher Exact Test:
 
->>> from gpsea.analysis.pcats.stats import ScipyFisherExact
->>> count_statistic = ScipyFisherExact()
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
+>>> count_statistic = FisherExactTest()
 
 and we finalize the analysis setup by putting all components together
 into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst
index f8958b8ae..070539d19 100644
--- a/docs/user-guide/mtc.rst
+++ b/docs/user-guide/mtc.rst
@@ -96,9 +96,9 @@ when creating an instance of :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
 
 >>> from gpsea.analysis.mtc_filter import UseAllTermsMtcFilter
 >>> from gpsea.analysis.pcats import HpoTermAnalysis
->>> from gpsea.analysis.pcats.stats import ScipyFisherExact
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
 >>> analysis = HpoTermAnalysis(
-...     count_statistic=ScipyFisherExact(),
+...     count_statistic=FisherExactTest(),
 ...     mtc_filter=UseAllTermsMtcFilter(),
 ...     mtc_correction='bonferroni',  #      <--- The MTC correction setup
 ... )
diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst
index 96e9376e0..16f0060b7 100644
--- a/docs/user-guide/stats.rst
+++ b/docs/user-guide/stats.rst
@@ -176,8 +176,8 @@ including the *indirect* annotations whose presence is implied by the true path
 We will use :ref:<fisher-exact-test> to test the association
 between genotype and phenotype groups, as described previously.
 
->>> from gpsea.analysis.pcats.stats import ScipyFisherExact
->>> count_statistic = ScipyFisherExact()
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
+>>> count_statistic = FisherExactTest()
 
 FET will compute a p value for each genotype phenotype group.
 
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index e4513fa0b..6600dab46 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -1,5 +1,6 @@
 import abc
 import math
+import os
 import typing
 
 from collections import Counter
@@ -273,20 +274,87 @@ def __init__(
             (e.g. Bonferroni MTC) or false discovery rate for the FDR procedures (e.g. Benjamini-Hochberg).
         """
         assert isinstance(count_statistic, CountStatistic)
+        assert len(count_statistic.supports_shape) == 2, "The statistic must support 2D contingency tables"
         self._count_statistic = count_statistic
         self._mtc_correction = mtc_correction
         assert isinstance(mtc_alpha, float) and 0. <= mtc_alpha <= 1.
         self._mtc_alpha = mtc_alpha
 
-    @abc.abstractmethod
     def compare_genotype_vs_phenotypes(
         self,
         cohort: typing.Iterable[Patient],
         gt_predicate: GenotypePolyPredicate,
         pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+    ) -> MultiPhenotypeAnalysisResult[P]:
+        # Check compatibility between the count statistic and predicate.
+        issues = MultiPhenotypeAnalysis._check_compatibility(
+            count_statistic=self._count_statistic,
+            gt_predicate=gt_predicate,
+            pheno_predicates=pheno_predicates,
+        )
+        if len(issues) != 0:
+            msg = os.linesep.join(issues)
+            raise ValueError(f'Cannot execute the analysis: {msg}')
+
+        return self._compute_result(
+            cohort=cohort,
+            gt_predicate=gt_predicate,
+            pheno_predicates=pheno_predicates,
+        )
+
+    @abc.abstractmethod
+    def _compute_result(
+        self,
+        cohort: typing.Iterable[Patient],
+        gt_predicate: GenotypePolyPredicate,
+        pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
     ) -> MultiPhenotypeAnalysisResult[P]:
         pass
 
+    @staticmethod
+    def _check_compatibility(
+        count_statistic: CountStatistic,
+        gt_predicate: GenotypePolyPredicate,
+        pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+    ) -> typing.Collection[str]:
+        # There should be 2 items due to check in `__init__`.
+        (pheno, geno) = count_statistic.supports_shape
+
+        issues = []
+        # Check phenotype
+        if isinstance(pheno, int):
+            pheno_accepted = (pheno,)
+        elif isinstance(pheno, typing.Sequence):
+            pheno_accepted = pheno
+        else:
+            issues.append('Cannot use a count statistic that does not check phenotypes')
+
+        pheno_failed = []
+        for i, ph_predicate in enumerate(pheno_predicates):
+            if ph_predicate.n_categorizations not in pheno_accepted:
+                pheno_failed.append(i)
+        if len(pheno_failed) != 0:
+            issues.append(
+                'Phenotype predicates {} are incompatible with the count statistic'.format(
+                    ', '.join(str(i) for i in pheno_failed)
+                )
+            )
+
+        # Check genotype
+        if isinstance(geno, int):
+            geno_accepted = (geno,)
+        elif isinstance(geno, typing.Sequence):
+            geno_accepted = geno
+        elif pheno is None:
+            raise ValueError('Cannot use a count statistic that does not check genotypes')
+        else:
+            raise ValueError(f'Cannot use a count statistic that supports shape {pheno, geno}')
+        
+        if gt_predicate.n_categorizations not in geno_accepted:
+            issues.append('Genotype predicate is incompatible with the count statistic')
+        
+        return issues
+
     def _compute_nominal_pvals(
         self,
         n_usable: typing.Iterable[int],
@@ -380,7 +448,7 @@ def __hash__(self):
 
 class DiseaseAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]):
 
-    def compare_genotype_vs_phenotypes(
+    def _compute_result(
         self,
         cohort: typing.Iterable[Patient],
         gt_predicate: GenotypePolyPredicate,
@@ -513,7 +581,7 @@ def __init__(
         assert isinstance(mtc_filter, PhenotypeMtcFilter)
         self._mtc_filter = mtc_filter
 
-    def compare_genotype_vs_phenotypes(
+    def _compute_result(
         self,
         cohort: typing.Iterable[Patient],
         gt_predicate: GenotypePolyPredicate,
diff --git a/src/gpsea/analysis/pcats/stats/__init__.py b/src/gpsea/analysis/pcats/stats/__init__.py
index 4d7e05437..a535129d4 100644
--- a/src/gpsea/analysis/pcats/stats/__init__.py
+++ b/src/gpsea/analysis/pcats/stats/__init__.py
@@ -1,5 +1,5 @@
-from ._stats import CountStatistic, ScipyFisherExact, PythonMultiFisherExact
+from ._stats import CountStatistic, FisherExactTest
 
 __all__ = [
-    'CountStatistic', 'ScipyFisherExact', 'PythonMultiFisherExact',
+    'CountStatistic', 'FisherExactTest',
 ]
diff --git a/src/gpsea/analysis/pcats/stats/_stats.py b/src/gpsea/analysis/pcats/stats/_stats.py
index 8df07fdcd..981176ee1 100644
--- a/src/gpsea/analysis/pcats/stats/_stats.py
+++ b/src/gpsea/analysis/pcats/stats/_stats.py
@@ -1,5 +1,7 @@
 import abc
 import math
+import typing
+
 from decimal import Decimal
 
 
@@ -14,9 +16,45 @@ class CountStatistic(metaclass=abc.ABCMeta):
     `CountStatistic` calculates a p value for a contingency table
     produced by a pair of discrete random variables.
 
-    The `counts` table is usually `2x2` or `2x3`.
+    
+    Supports shape
+    ^^^^^^^^^^^^^^
+
+    `CountStatistic` takes the counts in form of a data frame,
+    and some statistics impose additional requirements on the frame shape.
+    For instance, GPSEA's implementation of the Fisher exact test
+    can compare counts in a ``(2, 2)`` or ``(2, 3)`` arrays
+    but χ2 test can test an ``(m, n)`` array.
+
+    It is important to check that a genotype/phenotype predicate produces
+    the number of groups which the statistic can test.
+
+    The :attr:`supports_shape` returns a sequence with requirements
+    on the shape of the data array/frame. The sequence includes
+    the number of
+
+    Examples
+    ********
+
+    +------------------------+-------------------------+------------------+
+    | Test                   | Array shape             | `supports_shape` |
+    +========================+=========================+==================+
+    | Fisher Exact Test      | ``(2, [2, 3])``         | ``(2, [2,3])``   |
+    +------------------------+-------------------------+------------------+
+    | χ2                     | ``(*, *)``              | ``(None, None)`` |
+    +------------------------+-------------------------+------------------+
     """
 
+    @property
+    @abc.abstractmethod
+    def supports_shape(
+        self,
+    ) -> typing.Sequence[typing.Union[int, typing.Sequence[int], None]]:
+        """
+        Get a sequence of the supported shapes.
+        """
+        pass
+
     @abc.abstractmethod
     def compute_pval(
         self,
@@ -25,47 +63,35 @@ def compute_pval(
         pass
 
 
-class ScipyFisherExact(CountStatistic):
+class FisherExactTest(CountStatistic):
     """
-    `ScipyFisherExact` performs Fisher Exact Test on a `2x2` contingency table.
+    `FisherExactTest` performs Fisher Exact Test on a `2x2` or `2x3` contingency table.
 
-    The class is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function.
-    The two-sided :math:`H_1` is considered.
+    The `2x2` version is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function,
+    while the `2x3` variant is implemented in Python.
+    In both variants, the two-sided :math:`H_1` is considered.
     """
+    
+    def __init__(self):
+        self._shape = (2, (2, 3))
 
-    def compute_pval(
+    @property
+    def supports_shape(
         self,
-        counts: pd.DataFrame,
-    ) -> float:
-        assert counts.shape == (
-            2,
-            2,
-        ), f"Cannot run Fisher exact test on an array with {counts.shape} shape"
-        _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided")
-        return pval
-
-
-class PythonMultiFisherExact(CountStatistic):
-    """
-    `PythonMultiFisherExact` is a Python implementation of Fisher Exact Test to compute
-    p value for a `2x3` contingency table.
-    """
+    ) -> typing.Sequence[typing.Union[int, typing.Sequence[int], None]]:
+        return self._shape
 
     def compute_pval(
         self,
         counts: pd.DataFrame,
     ) -> float:
-        PythonMultiFisherExact._check_input(counts)
-        return self._fisher_exact(counts.values)
-
-    @staticmethod
-    def _check_input(a: pd.DataFrame):
-        if not isinstance(a, pd.DataFrame):
-            raise ValueError(f"Expected a pandas DataFrame but got {type(a)}")
-        if not a.shape == (2, 3):
-            raise ValueError(f"Shape of the array must be (2, 3) but got {a.shape}")
-        if np.array_equal(a.values, np.zeros_like(a)):
-            raise ValueError("Data frame is all zeros, cannot run analysis")
+        if counts.shape == (2, 2):
+            _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided")
+            return pval
+        elif counts.shape == (2, 3):
+            return self._fisher_exact(counts.values)
+        else:
+            raise ValueError(f'Unsupported counts shape {counts.shape}')
 
     def _fisher_exact(
         self,
diff --git a/src/gpsea/analysis/pcats/stats/_test__stats.py b/src/gpsea/analysis/pcats/stats/_test__stats.py
index aa2543063..c8db0b164 100644
--- a/src/gpsea/analysis/pcats/stats/_test__stats.py
+++ b/src/gpsea/analysis/pcats/stats/_test__stats.py
@@ -2,14 +2,14 @@
 import pandas as pd
 import pytest
 
-from ._stats import PythonMultiFisherExact
+from ._stats import FisherExactTest
 
 
 class TestPythonMultiFisherExact:
 
     @pytest.fixture
-    def fisher_exact(self) -> PythonMultiFisherExact:
-        return PythonMultiFisherExact()
+    def fisher_exact(self) -> FisherExactTest:
+        return FisherExactTest()
 
     @pytest.mark.parametrize(
         "counts, expected",
@@ -24,10 +24,9 @@ def test_compute_pval(
         self,
         counts,
         expected,
-        fisher_exact: PythonMultiFisherExact,
+        fisher_exact: FisherExactTest,
     ):
         contingency_matrix = pd.DataFrame(np.array(counts, dtype=np.int64))
 
         final_pval = fisher_exact.compute_pval(contingency_matrix)
         assert final_pval == pytest.approx(expected)
-
diff --git a/tests/analysis/pcats/test_disease.py b/tests/analysis/pcats/test_disease.py
index 48acece3e..9e16b3002 100644
--- a/tests/analysis/pcats/test_disease.py
+++ b/tests/analysis/pcats/test_disease.py
@@ -4,7 +4,7 @@
 from gpsea.model import Cohort
 
 from gpsea.analysis.pcats import DiseaseAnalysis
-from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact
+from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest
 from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
 from gpsea.analysis.predicate.phenotype import DiseasePresencePredicate
 
@@ -13,7 +13,7 @@ class TestDiseaseAnalysis:
 
     @pytest.fixture(scope='class')
     def count_statistic(self) -> CountStatistic:
-        return ScipyFisherExact()
+        return FisherExactTest()
 
     @pytest.fixture(scope='class')
     def suox_disease(self) -> DiseasePresencePredicate:
diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py
index 516bf9fc5..690e9dd14 100644
--- a/tests/analysis/pcats/test_hpo_term_analysis.py
+++ b/tests/analysis/pcats/test_hpo_term_analysis.py
@@ -7,7 +7,7 @@
 
 from gpsea.analysis.mtc_filter import PhenotypeMtcFilter, HpoMtcFilter
 from gpsea.analysis.pcats import HpoTermAnalysis
-from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact
+from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest
 from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
 from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
 
@@ -16,7 +16,7 @@ class TestHpoTermAnalysis:
 
     @pytest.fixture(scope='class')
     def count_statistic(self) -> CountStatistic:
-        return ScipyFisherExact()
+        return FisherExactTest()
 
     @pytest.fixture(scope='class')
     def phenotype_mtc_filter(
diff --git a/tests/test_tutorial.py b/tests/test_tutorial.py
index 52924eff4..b592f1f85 100644
--- a/tests/test_tutorial.py
+++ b/tests/test_tutorial.py
@@ -6,7 +6,7 @@
 from gpsea.preprocessing import configure_caching_cohort_creator, CohortCreator, load_phenopackets
 from gpsea.analysis.pcats import HpoTermAnalysis
 from gpsea.analysis.mtc_filter import HpoMtcFilter
-from gpsea.analysis.pcats.stats import ScipyFisherExact
+from gpsea.analysis.pcats.stats import FisherExactTest
 from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate
 from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
 
@@ -53,7 +53,7 @@ def hpo_term_analysis(
         mtc_filter,
     ) -> HpoTermAnalysis:
         return HpoTermAnalysis(
-            count_statistic=ScipyFisherExact(),
+            count_statistic=FisherExactTest(),
             mtc_filter=mtc_filter,
             mtc_correction='fdr_bh',
             mtc_alpha=0.05,

From ae2e8d00166e292d2c0d53a6fc3a9b5c41db1619 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Mon, 2 Sep 2024 10:03:10 +0200
Subject: [PATCH 09/16] Add docs to `ModeOfInheritancePredicate`.

---
 .../predicate/genotype/_gt_predicates.py      | 49 ++++++++++++++++---
 .../analysis/predicate/genotype/_variant.py   |  2 +-
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index a032aae95..23fc4a5df 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -458,52 +458,89 @@ def __repr__(self) -> str:
 
 
 class ModeOfInheritancePredicate(GenotypePolyPredicate):
+    """
+    `ModeOfInheritancePredicate` assigns an individual into a group based on compatibility with
+    the selected mode of inheritance.
+    """
 
     @staticmethod
     def autosomal_dominant(
         variant_predicate: VariantPredicate,
     ) -> "ModeOfInheritancePredicate":
+        """
+        Create a predicate that assigns the patient either
+        into homozygous reference or heterozygous
+        group in line with the autosomal dominant mode of inheritance.
+
+        :param variant_predicate: a predicate for choosing the variants for testing.
+        """
         return ModeOfInheritancePredicate.from_moi_info(
             variant_predicate=variant_predicate,
-            mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_dominant(),
+            mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(),
         )
     
     @staticmethod
     def autosomal_recessive(
         variant_predicate: VariantPredicate,
     ) -> "ModeOfInheritancePredicate":
+        """
+        Create a predicate that assigns the patient either into
+        homozygous reference, heterozygous, or biallelic alternative allele
+        (homozygous alternative or compound heterozygous)
+        group in line with the autosomal recessive mode of inheritance.
+
+        :param variant_predicate: a predicate for choosing the variants for testing.
+        """
         return ModeOfInheritancePredicate.from_moi_info(
             variant_predicate=variant_predicate,
-            mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_recessive(),
+            mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(),
         )
     
     @staticmethod
     def x_dominant(
         variant_predicate: VariantPredicate,
     ) -> "ModeOfInheritancePredicate":
+        """
+        Create a predicate that assigns the patient either into
+        homozygous reference or heterozygous
+        group in line with the X-linked dominant mode of inheritance.
+        
+        :param variant_predicate: a predicate for choosing the variants for testing.
+        """
         return ModeOfInheritancePredicate.from_moi_info(
             variant_predicate=variant_predicate,
-            mode_of_inheritance_info=ModeOfInheritanceInfo.x_dominant(),
+            mode_of_inheritance_data=ModeOfInheritanceInfo.x_dominant(),
         )
 
     @staticmethod
     def x_recessive(
         variant_predicate: VariantPredicate,
     ) -> "ModeOfInheritancePredicate":
+        """
+        Create a predicate that assigns the patient either into
+        homozygous reference, heterozygous, biallelic alternative allele
+        (homozygous alternative or compound heterozygous), or hemizygous
+        group in line with the X-linked recessive mode of inheritance.
+
+        :param variant_predicate: a predicate for choosing the variants for testing.
+        """
         return ModeOfInheritancePredicate.from_moi_info(
             variant_predicate=variant_predicate,
-            mode_of_inheritance_info=ModeOfInheritanceInfo.x_recessive(),
+            mode_of_inheritance_data=ModeOfInheritanceInfo.x_recessive(),
         )
 
     @staticmethod
     def from_moi_info(
         variant_predicate: VariantPredicate,
-        mode_of_inheritance_info: ModeOfInheritanceInfo,
+        mode_of_inheritance_data: ModeOfInheritanceInfo,
     ) -> "ModeOfInheritancePredicate":
+        """
+        Create a predicate for specified mode of inheritance data.
+        """
         allele_counter = AlleleCounter(predicate=variant_predicate)
         return ModeOfInheritancePredicate(
             allele_counter=allele_counter,
-            mode_of_inheritance_info=mode_of_inheritance_info,
+            mode_of_inheritance_info=mode_of_inheritance_data,
         )
 
     def __init__(
diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py
index aeaf6e1df..54d397352 100644
--- a/src/gpsea/analysis/predicate/genotype/_variant.py
+++ b/src/gpsea/analysis/predicate/genotype/_variant.py
@@ -313,7 +313,7 @@ def is_structural_deletion(
             to determine if the length of the variant is above or below `threshold`.
 
             **IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele
-            is shorter than the reference allele. See the method's documentation for more info.
+            is shorter than the reference allele. See :ref:`change-length-of-an-allele` for more info.
 
         **Example**
 

From 304df63f25dd4aac4d66c87d63913fe7371cae97 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Mon, 2 Sep 2024 10:13:44 +0200
Subject: [PATCH 10/16] Fix compatibility checking bug.

---
 src/gpsea/analysis/pcats/_impl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index 6600dab46..3448ba93b 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -331,7 +331,7 @@ def _check_compatibility(
 
         pheno_failed = []
         for i, ph_predicate in enumerate(pheno_predicates):
-            if ph_predicate.n_categorizations not in pheno_accepted:
+            if ph_predicate.n_categorizations() not in pheno_accepted:
                 pheno_failed.append(i)
         if len(pheno_failed) != 0:
             issues.append(
@@ -350,7 +350,7 @@ def _check_compatibility(
         else:
             raise ValueError(f'Cannot use a count statistic that supports shape {pheno, geno}')
         
-        if gt_predicate.n_categorizations not in geno_accepted:
+        if gt_predicate.n_categorizations() not in geno_accepted:
             issues.append('Genotype predicate is incompatible with the count statistic')
         
         return issues

From ade5d5c71913d83fb9721c65cec7511ef60bf21a Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 11:29:08 +0200
Subject: [PATCH 11/16] Remove the obsolete
 `PhenotypeMtcFilter.filter_terms_to_test` method.

---
 src/gpsea/analysis/mtc_filter/_impl.py        | 127 -----------------
 src/gpsea/analysis/pcats/__init__.py          |   2 +
 src/gpsea/analysis/pcats/_impl.py             |   2 +
 .../analysis/pcats/test_hpo_term_analysis.py  |  56 +++++++-
 tests/analysis/test_analysis.py               |  26 +---
 tests/analysis/test_mtc_filter.py             | 130 ++++++++++--------
 6 files changed, 124 insertions(+), 219 deletions(-)

diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py
index 0aca8cc7d..82a044c13 100644
--- a/src/gpsea/analysis/mtc_filter/_impl.py
+++ b/src/gpsea/analysis/mtc_filter/_impl.py
@@ -90,36 +90,6 @@ def filter(
         """
         pass
 
-    @abc.abstractmethod
-    def filter_terms_to_test(
-        self,
-        gt_predicate: GenotypePolyPredicate,
-        n_usable: typing.Mapping[hpotk.TermId, int],
-        all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
-    ) -> typing.Tuple[
-        typing.Mapping[hpotk.TermId, int],
-        typing.Mapping[hpotk.TermId, pd.DataFrame],
-        typing.Mapping[str, int],
-    ]:
-        """
-        Decide which terms to pass through for statistical testing.
-        The intention of this class is to reduce multiple testing burden by removing terms that are unlikely
-        to lead to interesting statistical/analytical results.
-
-        Args:
-            gt_predicate: the predicate used to bin patients into groups along the genotype axis
-            n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients
-              that could be binned according to the used genotype/phenotype predicate.
-            all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients
-              in the i-th phenotype (rows) and j-th genotype (column) group
-        Returns:
-           a tuple with three items:
-            - a mapping from :class:`hpotk.TermId` ->
-            - a mapping from :class:`hpotk.TermId` ->
-            - a mapping from a `str` with reason why a term was filtered out (e.g. *Skipping top level term*)
-        """
-        pass
-
     @abc.abstractmethod
     def filter_method_name(self) -> str:
         """
@@ -453,103 +423,6 @@ def filter(
 
         return tuple(results)
 
-    def filter_terms_to_test(
-        self,
-        gt_predicate: GenotypePolyPredicate,
-        n_usable: typing.Mapping[hpotk.TermId, int],
-        all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
-    ) -> typing.Tuple[
-        typing.Mapping[hpotk.TermId, int],
-        typing.Mapping[hpotk.TermId, pd.DataFrame],
-        typing.Mapping[str, int],
-    ]:
-        """
-        Args:
-            gt_predicate: the predicate used to bin patients into groups along the genotype axis
-            n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients
-              that could be binned according to the used genotype/phenotype predicate.
-            all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients
-              in the i-th phenotype (rows) and j-th genotype (column) group
-        """
-        filtered_n_usable = {}
-        filtered_all_counts = {}
-        reason_for_filtering_out = defaultdict(int)
-        tested_counts_pf = defaultdict(
-            pd.DataFrame
-        )  # key is an HP id, value is a tuple with counts, i.e.,
-
-        # iterate through the terms in the sorted order, starting from the leaves of the induced graph.
-        categories = tuple(gt_predicate.get_categories())
-        for term_id in self._get_ordered_terms(all_counts.keys()):
-            if term_id in self._general_hpo_terms:
-                reason_for_filtering_out["Skipping general term"] += 1
-                continue
-            
-            if not self._hpo.graph.is_ancestor_of(
-                hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id
-            ):
-                reason_for_filtering_out["Skipping non phenotype term"] += 1
-                continue
-
-            # get total number of observations
-            if term_id not in all_counts:
-                reason_for_filtering_out["Skipping non-target term"] += 1
-                continue
-
-            counts_frame = all_counts[term_id]
-            total = counts_frame.sum().sum()
-            max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
-                counts_frame
-            )
-            if max_freq < self._hpo_term_frequency_filter:
-                reason = (
-                    "Skipping term with maximum frequency "
-                    f"that was less than threshold {self._hpo_term_frequency_filter}"
-                )
-                reason_for_filtering_out[reason] += 1
-                continue
-
-            if counts_frame.shape == (2, 2) and total < 7:
-                reason = f"Skipping term with only {total} observations (not powered for 2x2)"
-                reason_for_filtering_out[reason] += 1
-                continue
-
-            # todo -- similar for (3,2)
-            if not HpoMtcFilter.some_cell_has_greater_than_one_count(counts_frame):
-                reason = "Skipping term because no genotype has more than one observed HPO count"
-                reason_for_filtering_out[reason] += 1
-                continue
-
-            elif HpoMtcFilter.genotypes_have_same_hpo_proportions(
-                counts_frame,
-                categories,
-            ):
-                reason = "Skipping term because all genotypes have same HPO observed proportions"
-                reason_for_filtering_out[reason] += 1
-                continue
-
-            elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(
-                counts_frame,
-                categories,
-            ):
-                reason = "Skipping term because one genotype had zero observations"
-                reason_for_filtering_out[reason] += 1
-                continue
-
-            for child_term_id in self._hpo.graph.get_children(term_id):
-                if child_term_id in tested_counts_pf:
-                    if tested_counts_pf[child_term_id].equals(counts_frame):
-                        # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches?
-                        reason = "Child term with same counts previously tested"
-                        reason_for_filtering_out[reason] += 1
-                        continue
-            
-            # if we get here, then we include the test for `term_id`
-            filtered_n_usable[term_id] = n_usable[term_id]
-            filtered_all_counts[term_id] = all_counts[term_id]
-
-        return filtered_n_usable, filtered_all_counts, reason_for_filtering_out
-
     def filter_method_name(self) -> str:
         return "HPO MTC filter"
 
diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py
index 6a82d92f1..59eceb977 100644
--- a/src/gpsea/analysis/pcats/__init__.py
+++ b/src/gpsea/analysis/pcats/__init__.py
@@ -21,9 +21,11 @@
 from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult
 from ._impl import DiseaseAnalysis
 from ._impl import HpoTermAnalysis, HpoTermAnalysisResult
+from ._impl import apply_predicates_on_patients
 
 __all__ = [
     'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult',
     'DiseaseAnalysis',
     'HpoTermAnalysis', 'HpoTermAnalysisResult',
+    'apply_predicates_on_patients',
 ]
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index 3448ba93b..43d8c7077 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -606,6 +606,8 @@ def _compute_result(
             counts=all_counts,
         )
         mtc_mask = np.array([r.is_passed() for r in mtc_filter_results])
+        if not mtc_mask.any():
+            raise ValueError("No phenotypes are left for the analysis after MTC filtering step")
 
         # 3 - Compute nominal p values
         pvals = np.full(shape=(len(n_usable),), fill_value=np.nan)
diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py
index 690e9dd14..dbd8d508f 100644
--- a/tests/analysis/pcats/test_hpo_term_analysis.py
+++ b/tests/analysis/pcats/test_hpo_term_analysis.py
@@ -14,18 +14,18 @@
 
 class TestHpoTermAnalysis:
 
-    @pytest.fixture(scope='class')
+    @pytest.fixture(scope="class")
     def count_statistic(self) -> CountStatistic:
         return FisherExactTest()
 
-    @pytest.fixture(scope='class')
+    @pytest.fixture(scope="class")
     def phenotype_mtc_filter(
         self,
         hpo: hpotk.MinimalOntology,
     ) -> PhenotypeMtcFilter:
         return HpoMtcFilter.default_filter(
             hpo=hpo,
-            term_frequency_threshold=.2,
+            term_frequency_threshold=0.2,
         )
 
     @pytest.fixture
@@ -37,8 +37,8 @@ def analysis(
         return HpoTermAnalysis(
             count_statistic=count_statistic,
             mtc_filter=phenotype_mtc_filter,
-            mtc_correction='fdr_bh',
-            mtc_alpha=.05,
+            mtc_correction="fdr_bh",
+            mtc_alpha=0.05,
         )
 
     def test_compare_genotype_vs_phenotypes(
@@ -46,7 +46,7 @@ def test_compare_genotype_vs_phenotypes(
         analysis: HpoTermAnalysis,
         suox_cohort: Cohort,
         suox_gt_predicate: GenotypePolyPredicate,
-        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]
+        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
     ):
         results = analysis.compare_genotype_vs_phenotypes(
             cohort=suox_cohort.all_patients,
@@ -55,4 +55,46 @@ def test_compare_genotype_vs_phenotypes(
         )
 
         assert results is not None
-        # TODO: improve testing
+
+        assert results.total_tests == 4
+        assert results.n_usable == (35, 18, 13, 25, 23)
+        assert results.pvals == pytest.approx(
+            [
+                0.0721291631224236,
+                1.0,
+                float("nan"),
+                0.35921595820909313,
+                0.6668461434917216,
+            ],
+            nan_ok=True,
+        )
+        assert results.corrected_pvals is not None
+        assert results.corrected_pvals == pytest.approx(
+            [
+                0.2885166524896944,
+                1.0,
+                float("nan"),
+                0.7184319164181863,
+                0.8891281913222954,
+            ],
+            nan_ok=True,
+        )
+
+    def test_compare_genotype_vs_phenotypes_explodes_if_no_phenotypes_are_left_after_mtc_filter(
+        self,
+        analysis: HpoTermAnalysis,
+        degenerated_cohort: Cohort,
+        suox_gt_predicate: GenotypePolyPredicate,
+        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+    ):
+        with pytest.raises(ValueError) as e:
+            analysis.compare_genotype_vs_phenotypes(
+                cohort=degenerated_cohort,
+                gt_predicate=suox_gt_predicate,
+                pheno_predicates=suox_pheno_predicates,
+            )
+
+        assert (
+            e.value.args[0]
+            == "No phenotypes are left for the analysis after MTC filtering step"
+        )
diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py
index 5db246218..ea5408e28 100644
--- a/tests/analysis/test_analysis.py
+++ b/tests/analysis/test_analysis.py
@@ -8,10 +8,11 @@
 
 from gpsea.analysis import (
     apply_predicates_on_patients,
-    CohortAnalysis, CohortAnalysisConfiguration,
+    CohortAnalysis,
     configure_cohort_analysis,
 )
 
+from gpsea.analysis.mtc_filter._impl import HpoMtcFilter
 from gpsea.model import *
 from gpsea.model.genome import *
 from gpsea.analysis.predicate import PatientCategories
@@ -69,26 +70,3 @@ def test_analysis_passes_if_variant_predicate_always_returns_false(
             predicate=always_false_variant_predicate
         )
         assert results is not None
-
-    def test_analysis_explodes_if_no_phenotypes_are_left_for_analysis(
-        self,
-        hpo: hpotk.MinimalOntology,
-        degenerated_cohort: Cohort,
-        tmp_path: pathlib.Path,
-        always_false_variant_predicate: VariantPredicate,
-    ):
-        config = CohortAnalysisConfiguration()
-        config.hpo_mtc_strategy()
-        cohort_analysis = configure_cohort_analysis(
-            hpo=hpo,
-            cohort=degenerated_cohort,
-            cache_dir=str(tmp_path),
-            config=config,
-        )
-
-        with pytest.raises(ValueError) as e:
-            _ = cohort_analysis.compare_hpo_vs_genotype(
-                predicate=always_false_variant_predicate,
-            )
-
-        assert e.value.args[0] == 'No phenotypes are left for the analysis after MTC filtering step'
diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py
index c07600ef2..af6f14392 100644
--- a/tests/analysis/test_mtc_filter.py
+++ b/tests/analysis/test_mtc_filter.py
@@ -1,3 +1,4 @@
+from itertools import count
 import typing
 
 import hpotk
@@ -5,11 +6,11 @@
 import pandas as pd
 import pytest
 
-from gpsea.analysis import apply_predicates_on_patients
 from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter
 from gpsea.analysis.predicate import PatientCategories
 from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
 from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
+from gpsea.analysis.pcats import apply_predicates_on_patients
 from gpsea.model import Cohort
 
 
@@ -31,16 +32,13 @@ def patient_counts(
             suox_cohort: Cohort,
             suox_gt_predicate: GenotypePolyPredicate,
             suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
-    ) -> typing.Tuple[
-        typing.Mapping[hpotk.TermId, int],
-        typing.Mapping[hpotk.TermId, pd.DataFrame],
-    ]:
-        categories, n_usable, all_counts = apply_predicates_on_patients(
+    ) -> typing.Sequence[pd.DataFrame]:
+        _, counts = apply_predicates_on_patients(
             patients=suox_cohort.all_patients,
             gt_predicate=suox_gt_predicate,
             pheno_predicates=suox_pheno_predicates,
         )
-        return n_usable, all_counts
+        return counts
 
     @pytest.fixture(scope='class')
     def gt_categories(self) -> pd.Index:
@@ -135,74 +133,80 @@ def test_genotypes_have_same_hpo_proportions(
         assert actual == expected
 
     def test_filter_terms_to_test(
-            self,
-            mtc_filter: HpoMtcFilter,
-            suox_gt_predicate: GenotypePolyPredicate,
-            patient_counts: typing.Tuple[
-                typing.Mapping[hpotk.TermId, int],
-                typing.Mapping[hpotk.TermId, pd.DataFrame],
-            ],
+        self,
+        mtc_filter: HpoMtcFilter,
+        suox_gt_predicate: GenotypePolyPredicate,
+        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+        patient_counts: typing.Sequence[pd.DataFrame],
     ):
-        n_usable, all_counts = patient_counts
-        mtc_report = mtc_filter.filter_terms_to_test(
-            suox_gt_predicate,
-            n_usable,
-            all_counts,
+        phenotypes = [p.phenotype for p in suox_pheno_predicates]
+        
+        mtc_report = mtc_filter.filter(
+            gt_predicate=suox_gt_predicate,
+            phenotypes=phenotypes,
+            counts=patient_counts,
         )
 
-        assert isinstance(mtc_report, tuple)
-        assert len(mtc_report) == 3
-
-        filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report
+        assert isinstance(mtc_report, typing.Sequence)
+        assert len(mtc_report) == 5
 
-        assert reason_for_filtering_out['Skipping term because all genotypes have same HPO observed proportions'] == 1
-        assert reason_for_filtering_out['Skipping general term'] == 16
-        assert reason_for_filtering_out['Skipping non-target term'] == 5
-        assert reason_for_filtering_out['Skipping top level term'] == 0
+        is_ok = [r.is_passed() for r in mtc_report]
+        assert is_ok == [True, True, False, True, True]
 
-        assert len(filtered_n_usable) == 4
-        assert len(filtered_all_counts) == 4
+        reasons = [r.reason for r in mtc_report]
+        assert reasons == [
+            None, None,
+            'Skipping term because all genotypes have same HPO observed proportions',
+            None, None,
+        ]
 
     def test_specified_term_mtc_filter(
-            self,
-            suox_gt_predicate: GenotypePolyPredicate,
-            patient_counts: typing.Tuple[
-                typing.Mapping[hpotk.TermId, int],
-                typing.Mapping[hpotk.TermId, pd.DataFrame],
-            ],
+        self,
+        suox_gt_predicate: GenotypePolyPredicate,
+        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+        patient_counts: typing.Sequence[pd.DataFrame],
     ):
         """
         The point of this test is to check that if we filter to test only one term ("HP:0032350"), then this
         is the only term that should survive the filter. We start with a total of five terms (n_usable==5),
-        but after our filter, only one survives (filtered_n_usable == 1), and we have four cases in which the
-        reason for filtering out is 'Skipping non-specified term'
+        but after our filter, only one survives, and we have four cases in which the
+        reason for filtering out is 'Non-specified term'
         """
-        specified_filter = SpecifiedTermsMtcFilter(terms_to_test={hpotk.TermId.from_curie("HP:0032350")})
-        n_usable, all_counts = patient_counts
-        mtc_report = specified_filter.filter_terms_to_test(
-            suox_gt_predicate,
-            n_usable,
-            all_counts,
+        specified_filter = SpecifiedTermsMtcFilter(
+            terms_to_test=(hpotk.TermId.from_curie("HP:0032350"),),
+        )
+        phenotypes = [p.phenotype for p in suox_pheno_predicates]
+        
+        mtc_report = specified_filter.filter(
+            gt_predicate=suox_gt_predicate,
+            phenotypes=phenotypes,
+            counts=patient_counts,
         )
-        assert isinstance(mtc_report, tuple)
-        assert len(mtc_report) == 3  # # Skipping non-specified term (n=5)
+        assert isinstance(mtc_report, typing.Sequence)
+        assert len(mtc_report) == 5
+
+        is_passed = [r.is_passed() for r in mtc_report]
+        assert is_passed == [
+            False, False, True, False, False,
+        ]
 
-        filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report
-        assert len(n_usable) == 5
-        assert len(filtered_n_usable) == 1
-        assert reason_for_filtering_out['Skipping non-specified term'] == 4
+        reasons = [r.reason for r in mtc_report]
+        assert reasons == [
+            'Non-specified term',
+            'Non-specified term',
+            None,
+            'Non-specified term',
+            'Non-specified term',
+        ]
 
     def test_min_observed_HPO_threshold(
         self,
-        patient_counts: typing.Tuple[
-            typing.Mapping[hpotk.TermId, int],
-            typing.Mapping[hpotk.TermId, pd.DataFrame],
-        ],
+        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+        patient_counts: typing.Sequence[pd.DataFrame],
     ):
         """
         In our heuristic filter, we only test terms that have at least a threshold
-        frequency in at least one of the groups. We use the "all counts" datastructure, that
-        is a dictionary whose keys are hpotoolkit TermIds and whose values are pandas DataFrames
+        frequency in at least one of the groups. We use the `patient_counts` - a sequence of DataFrames
         with 2x2 contingenicy tables of counts. For instance, each column will have one row for
         PatientCategories.YES and one for PatientCategories.NO, indicating counts of measured observed/excluded
         HPO phenotypes. Each column is a certain genotype, e.g., MISSENSE or NON-MISSENSE. We want the
@@ -211,28 +215,32 @@ def test_min_observed_HPO_threshold(
         for all of the HPO terms in the dictionary.
         """
         EPSILON = 0.001
-        _, all_counts = patient_counts
+        curie2idx = {
+            p.phenotype.value: i
+            for i, p
+            in enumerate(suox_pheno_predicates)
+        }
         # Ectopia lentis HP:0001083  (6 9  1 2), freqs are 6/15=0.4 and 1/3=0.33
-        ectopia = all_counts[hpotk.TermId.from_curie("HP:0001083")]
+        ectopia = patient_counts[curie2idx["HP:0001083"]]
         max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ectopia)
         assert max_f == pytest.approx(0.4, abs=EPSILON)
         
         # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1
-        seizure = all_counts[hpotk.TermId.from_curie("HP:0001250")]
+        seizure = patient_counts[curie2idx["HP:0001250"]]
         max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(seizure)
         assert max_f == pytest.approx(1.0, abs=EPSILON)
         
         # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1
-        sulfocysteinuria = all_counts[hpotk.TermId.from_curie("HP:0032350")]
+        sulfocysteinuria = patient_counts[curie2idx["HP:0032350"]]
         max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(sulfocysteinuria)
         assert max_f == pytest.approx(1.0, abs=EPSILON)
         
         # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5
-        ndelay = all_counts[hpotk.TermId.from_curie("HP:0012758")]
+        ndelay = patient_counts[curie2idx["HP:0012758"]]
         max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ndelay)
         assert max_f == pytest.approx(0.5, abs=EPSILON)
         
         # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714
-        hypertonia = all_counts[hpotk.TermId.from_curie("HP:0001276")]
+        hypertonia = patient_counts[curie2idx["HP:0001276"]]
         max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(hypertonia)
         assert max_f == pytest.approx(0.5714, abs=EPSILON)

From ab0a116e13baa937ec3a6f57a928cdfd3fa632f3 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 11:56:20 +0200
Subject: [PATCH 12/16] Add
 `PhenotypePolyPredicate.present_phenotype_categorization` property.

---
 .../analysis/predicate/phenotype/_pheno.py    | 30 +++++++++++++++----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
index c2646f727..9a9c496c8 100644
--- a/src/gpsea/analysis/predicate/phenotype/_pheno.py
+++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -60,11 +60,12 @@ class PhenotypePolyPredicate(
 
     Each phenotype category `P` can be a :class:`~hpotk.model.TermId` representing an HPO term or an OMIM/MONDO term.
 
-    Most of the time, only one category is investigated, and :attr:`phenotype` will return a sequence with
-    one element only (e.g. *Arachnodactyly* `HP:0001166`). However, multiple categories can be sought as well,
-    such as when the predicate bins the patient into one of discrete diseases
-    (e.g. Geleophysic dysplasia, Marfan syndrome, ...). Then the predicate should return a sequence of disease
-    identifiers.
+    Only one category can be investigated, and :attr:`phenotype` returns the investigated phenotype
+    (e.g. *Arachnodactyly* `HP:0001166`).
+
+    As another hallmark of this predicate, one of the categorizations must correspond to the group of patients
+    who exibit the investigated phenotype. The categorization is provided
+    via :attr:`present_phenotype_categorization` property.
     """
 
     @property
@@ -75,6 +76,14 @@ def phenotype(self) -> P:
         """
         pass
 
+    @property
+    @abc.abstractmethod
+    def present_phenotype_categorization(self) -> PhenotypeCategorization[P]:
+        """
+        Get the categorization which represents the group of the patients who exibit the investigated phenotype.
+        """
+        pass
+
 
 class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]):
     """
@@ -112,6 +121,7 @@ def __init__(
             category=PatientCategories.NO,
             phenotype=self._query,
         )
+        self._categorizations = (self._phenotype_observed, self._phenotype_excluded)
 
     def get_question(self) -> str:
         return f"Is {self._query_label} present in the patient?"
@@ -120,10 +130,14 @@ def get_question(self) -> str:
     def phenotype(self) -> hpotk.TermId:
         return self._query
 
+    @property
+    def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]:
+        return self._phenotype_observed
+
     def get_categorizations(
         self,
     ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]:
-        return self._phenotype_observed, self._phenotype_excluded
+        return self._categorizations
 
     def test(
         self, patient: Patient
@@ -207,6 +221,10 @@ def get_question(self) -> str:
     def phenotype(self) -> hpotk.TermId:
         return self._query
 
+    @property
+    def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]:
+        return self._diagnosis_present
+
     def get_categorizations(
         self,
     ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]:

From b35790b8a6794428b2eef0d4fe3d830e6e6b8650 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 16:07:11 +0200
Subject: [PATCH 13/16] Improve `PhenotypeMtcFilter` tests.

---
 docs/report/tbx5_frameshift_vs_missense.csv   |  36 ++--
 ...bx5_frameshift_vs_missense.mtc_report.html |  21 +-
 docs/tutorial.rst                             |   8 +-
 src/gpsea/analysis/mtc_filter/_impl.py        | 202 ++++++------------
 src/gpsea/analysis/pcats/_impl.py             |   2 +-
 .../analysis/predicate/phenotype/_pheno.py    |   8 +
 tests/analysis/test_analysis.py               |  72 -------
 tests/analysis/test_examples.py               |   2 +
 tests/analysis/test_mtc_filter.py             | 151 +++++++++----
 9 files changed, 219 insertions(+), 283 deletions(-)
 delete mode 100644 tests/analysis/test_analysis.py

diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv
index 07245a440..0bc1e5c6c 100644
--- a/docs/report/tbx5_frameshift_vs_missense.csv
+++ b/docs/report/tbx5_frameshift_vs_missense.csv
@@ -1,22 +1,22 @@
 "Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,,
 ,Count,Percent,Count,Percent,Corrected p values,p values
-Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05
-Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214
-Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015
-Heart block [HP:0012722],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015
-Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.018011232501600187,0.005628510156750059
-Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.03598740440845704,0.01349527665317139
-Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.058517997576307476,0.02560162393963452
-Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.14881278039172777,0.07440639019586388
-Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2532484592139712,0.1424522583078588
-Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2699930339748754,0.1687456462342971
-Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6233766233766234,0.42857142857142855
-Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.7619047619047618,0.5714285714285713
-Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.7735491022101784
-Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
+Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05
+Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214
+Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015
+Heart block [HP:0012722],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015
+Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.0191369345329502,0.005628510156750059
+Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.038236617183985605,0.01349527665317139
+Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.062175372424826694,0.02560162393963452
+Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.15811357916621074,0.07440639019586388
+Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2690764879148444,0.1424522583078588
+Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2868675985983051,0.1687456462342971
+Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855
+Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713
+Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784
+Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0
 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
-Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,,
-Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,,
-Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,,
-Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,,
+Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
+Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,,
+Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,,
+Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
diff --git a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
index 5b149af79..b6ac17472 100644
--- a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
+++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
@@ -48,9 +48,9 @@
   <h1>Phenotype testing report</h1>
   <p>Phenotype MTC filter: <em>HPO MTC filter</em></p>
   <p>Multiple testing correction: <em>fdr_bh</em></p>
-  <p>Performed statistical tests for 16 out of the total of 260 HPO terms.</p>
+  <p>Performed statistical tests for 17 out of the total of 260 HPO terms.</p>
     <table>
-      <caption>Using <em>HPO MTC filter</em>, 244 term(s) were omitted from statistical analysis.</caption>
+      <caption>Using <em>HPO MTC filter</em>, 243 term(s) were omitted from statistical analysis.</caption>
         <tbody>
           <tr>
             <th>Code</th>
@@ -61,21 +61,21 @@ <h1>Phenotype testing report</h1>
           <tr>
             <!-- TODO: plug the real reason code here -->
             <td>TODO</td>
-            <td>Skipping general term</td>
-            <td>44</td>
+            <td>Skipping term with maximum frequency that was less than threshold 0.2</td>
+            <td>51</td>
           </tr>
           
           <tr>
             <!-- TODO: plug the real reason code here -->
             <td>TODO</td>
-            <td>Skipping term because all genotypes have same HPO observed proportions</td>
-            <td>42</td>
+            <td>Skipping general term</td>
+            <td>44</td>
           </tr>
           
           <tr>
             <!-- TODO: plug the real reason code here -->
             <td>TODO</td>
-            <td>Skipping term with only 0 observations (not powered for 2x2)</td>
+            <td>Skipping term because all genotypes have same HPO observed proportions</td>
             <td>41</td>
           </tr>
           
@@ -114,13 +114,6 @@ <h1>Phenotype testing report</h1>
             <td>12</td>
           </tr>
           
-          <tr>
-            <!-- TODO: plug the real reason code here -->
-            <td>TODO</td>
-            <td>Skipping term with maximum frequency that was less than threshold 0.2</td>
-            <td>10</td>
-          </tr>
-          
           <tr>
             <!-- TODO: plug the real reason code here -->
             <td>TODO</td>
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index e047943ce..29d8f2d9a 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -246,15 +246,15 @@ Now we can perform the analysis and investigate the results.
 ...     pheno_predicates=pheno_predicates,
 ... )
 >>> result.total_tests
-16
+17
 
-We only tested 16 HPO terms. This is despite the individuals being collectively annotated with
+We only tested 1y HPO terms. This is despite the individuals being collectively annotated with
 260 direct and indirect HPO terms
 
 >>> len(result.phenotypes)
 260
 
-We can show the reasoning behind *not* testing 244 (`260 - 16`) HPO terms
+We can show the reasoning behind *not* testing 243 (`260 - 17`) HPO terms
 by exploring the phenotype MTC filtering report.
 
 >>> from gpsea.view import MtcStatsViewer
@@ -283,4 +283,4 @@ was observed in 31/60 (52%) patients with a missense variant
 but it was observed in 19/19 (100%) patients with a frameshift variant.
 Fisher exact test computed a p value of `~0.0000562`
 and the p value corrected by Benjamini-Hochberg procedure
-is `~0.000899`.
+is `~0.000955`.
diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py
index 82a044c13..c52d10b9c 100644
--- a/src/gpsea/analysis/mtc_filter/_impl.py
+++ b/src/gpsea/analysis/mtc_filter/_impl.py
@@ -1,14 +1,15 @@
 import abc
 import typing
 
-from collections import defaultdict, deque
+from collections import deque
 
 import hpotk
+from matplotlib import axis
+import numpy as np
 import pandas as pd
 
-from ..predicate import PatientCategories, PatientCategory
 from ..predicate.genotype import GenotypePolyPredicate
-from ..predicate.phenotype import P
+from ..predicate.phenotype import PhenotypePolyPredicate, P
 
 
 class PhenotypeMtcResult:
@@ -76,14 +77,14 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta):
     def filter(
         self,
         gt_predicate: GenotypePolyPredicate,
-        phenotypes: typing.Sequence[P],
+        ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
         counts: typing.Sequence[pd.DataFrame],
     ) -> typing.Sequence[PhenotypeMtcResult]:
         """
         Test if the phenotype with given counts should be included in the downstream analysis.
 
         :param gt_predicate: the predicate that produced the columns of the `count` data frame.
-        :param phenotypes: the tested phenotypes.
+        :param ph_predicates: the phenotype predicates that produced the rows of the `counts` data frames.
         :param counts: a sequence of 2D data frames for the tested phenotypes.
             Each data frame corrresponds to a genotype/phenotype contingency matrix.
         :returns: a sequence of filter results for the input `phenotypes`.
@@ -111,27 +112,11 @@ def __init__(self):
     def filter(
         self,
         gt_predicate: GenotypePolyPredicate,
-        phenotypes: typing.Sequence[P],
+        ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
         counts: typing.Sequence[pd.DataFrame],
     ) -> typing.Sequence[PhenotypeMtcResult]:
         # Always OK! 😏
-        return tuple(self._ok for _ in phenotypes)
-
-    def filter_terms_to_test(
-        self,
-        gt_predicate: GenotypePolyPredicate,
-        n_usable: typing.Mapping[hpotk.TermId, int],
-        all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
-    ) -> typing.Tuple[
-        typing.Mapping[hpotk.TermId, int],
-        typing.Mapping[hpotk.TermId, pd.DataFrame],
-        typing.Mapping[str, int],
-    ]:
-        """
-        Use this implementation to test all available HPO terms.
-        No HPO terms will be filtered out!
-        """
-        return n_usable, all_counts, {}
+        return tuple(self._ok for _ in ph_predicates)
 
     def filter_method_name(self) -> str:
         return "All HPO terms"
@@ -164,55 +149,17 @@ def __init__(
     def filter(
         self,
         gt_predicate: GenotypePolyPredicate,
-        phenotypes: typing.Sequence[P],
+        ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
         counts: typing.Sequence[pd.DataFrame],
     ) -> typing.Sequence[PhenotypeMtcResult]:
         results = []
-        for phenotype in phenotypes:
-            if phenotype in self._terms_to_test_set:
+        for predicate in ph_predicates:
+            if predicate.phenotype in self._terms_to_test_set:
                 results.append(self._ok)
             else:
                 results.append(self._fail)
         return tuple(results)
 
-    def filter_terms_to_test(
-        self,
-        gt_predicate: GenotypePolyPredicate,
-        n_usable: typing.Mapping[hpotk.TermId, int],
-        all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
-    ) -> typing.Tuple[
-        typing.Mapping[hpotk.TermId, int],
-        typing.Mapping[hpotk.TermId, pd.DataFrame],
-        typing.Mapping[str, int],
-    ]:
-        """
-        Remove terms that are not members of the specific set of HPO terms to be tested.
-        
-        Args:
-            gt_predicate: the predicate used to bin patients into groups along the genotype axis
-            n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients
-              that could be binned according to the used genotype/phenotype predicate.
-            all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients
-              in the i-th phenotype (rows) and j-th genotype (column) group
-
-        Returns:
-            filtered versions of the two dictionaries above and dataframe with reasons for skipping
-        """
-        filtered_n_usable = {}
-        filtered_all_counts = {}
-        reason_for_filtering_out: typing.DefaultDict[str, int] = defaultdict(int)
-
-        for term_id in all_counts.keys():
-            if term_id not in self._terms_to_test_set:
-                reason_for_filtering_out["Skipping non-specified term"] += 1
-                continue
-
-            # if we get here, then the term is a member of our list of terms to be tested.
-            filtered_n_usable[term_id] = n_usable[term_id]
-            filtered_all_counts[term_id] = all_counts[term_id]
-
-        return filtered_n_usable, filtered_all_counts, reason_for_filtering_out
-
     def filter_method_name(self) -> str:
         return "Specified terms MTC filter"
 
@@ -326,13 +273,13 @@ def __init__(
     def filter(
         self,
         gt_predicate: GenotypePolyPredicate,
-        phenotypes: typing.Sequence[hpotk.TermId],
+        ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
         counts: typing.Sequence[pd.DataFrame],
     ) -> typing.Sequence[PhenotypeMtcResult]:
+        phenotypes = [p.phenotype for p in ph_predicates]
         p_to_idx = {p: i for i, p in enumerate(phenotypes)}
         
-        results = [None for _ in range(len(phenotypes))]
-        categories = tuple(gt_predicate.get_categories())
+        results: typing.MutableSequence[typing.Optional[PhenotypeMtcResult]] = [None for _ in range(len(phenotypes))]
         for term_id in self._get_ordered_terms(phenotypes):
             try:
                 idx = p_to_idx[term_id]
@@ -357,10 +304,12 @@ def filter(
             #     reason_for_filtering_out["Skipping non-target term"] += 1
             #     continue
 
+            ph_predicate = ph_predicates[idx]
             contingency_matrix = counts[idx]
             total = contingency_matrix.sum().sum()
             max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
-                contingency_matrix
+                contingency_matrix,
+                ph_predicate=ph_predicate,
             )
             if max_freq < self._hpo_term_frequency_filter:
                 reason = (
@@ -375,23 +324,26 @@ def filter(
                 results[idx] = PhenotypeMtcResult.fail(reason)
                 continue
 
-            # todo -- similar for (3,2)
-            if not HpoMtcFilter.some_cell_has_greater_than_one_count(contingency_matrix):
+            if not HpoMtcFilter.some_cell_has_greater_than_one_count(
+                counts=contingency_matrix,
+                ph_predicate=ph_predicate,
+            ):
                 reason = "Skipping term because no genotype has more than one observed HPO count"
                 results[idx] = PhenotypeMtcResult.fail(reason)
                 continue
-
+            
             elif HpoMtcFilter.genotypes_have_same_hpo_proportions(
                 contingency_matrix,
-                categories,
+                gt_predicate=gt_predicate,
+                ph_predicate=ph_predicate,
             ):
                 reason = "Skipping term because all genotypes have same HPO observed proportions"
                 results[idx] = PhenotypeMtcResult.fail(reason)
                 continue
 
             elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(
-                contingency_matrix,
-                categories,
+                counts=contingency_matrix,
+                gt_predicate=gt_predicate,
             ):
                 reason = "Skipping term because one genotype had zero observations"
                 results[idx] = PhenotypeMtcResult.fail(reason)
@@ -417,78 +369,70 @@ def filter(
                     term_name = self._hpo.get_term_name(phenotypes[i])
                     missing.append(term_name)
             
-            print('BLAAAAAA')
             msg = 'Missing results for {}'.format(', '.join(missing))
             raise ValueError(msg)
 
-        return tuple(results)
+        # Ignoring the type hint, because we checked the type match above.
+        return tuple(results)  # type: ignore
 
     def filter_method_name(self) -> str:
         return "HPO MTC filter"
 
     @staticmethod
-    def get_number_of_observed_hpo_observations(counts_frame: pd.DataFrame) -> int:
-        return counts_frame.loc[PatientCategories.YES].sum()
+    def get_number_of_observed_hpo_observations(
+        counts_frame: pd.DataFrame,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+    ) -> int:
+        return counts_frame.loc[ph_predicate.present_phenotype_category].sum()
 
     @staticmethod
-    def get_maximum_group_observed_HPO_frequency(counts_frame: pd.DataFrame) -> float:
+    def get_maximum_group_observed_HPO_frequency(
+        counts_frame: pd.DataFrame,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+    ) -> float:
         """
         Returns:
             The maximum frequency of observed HPO annotations across all genotypes.
         """
-        df = counts_frame.loc[PatientCategories.YES] / (
-            counts_frame.loc[PatientCategories.YES]
-            + counts_frame.loc[PatientCategories.NO]
-        )
-        return df.max()
+        all_hpo_count_per_gt = counts_frame.sum()
+        if (all_hpo_count_per_gt == 0).all():
+            # Prevent division by zeros
+            return 0.
+        
+        present_hpo_count_per_gt = counts_frame.loc[ph_predicate.present_phenotype_category]
+        return (present_hpo_count_per_gt / all_hpo_count_per_gt).max()
 
     @staticmethod
     def one_genotype_has_zero_hpo_observations(
         counts: pd.DataFrame,
-        gt_categories: typing.Sequence[PatientCategory],
+        gt_predicate: GenotypePolyPredicate,
     ):
-        if not isinstance(counts, pd.DataFrame):
-            raise ValueError(
-                f"argument 'counts' must be pandas DataFrame but was {type(counts)}"
-            )
-
-        if counts.shape == (2, 2):
-            assert len(gt_categories) == 2, \
-                f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!"
-            a, b = gt_categories
-            return (
-                counts.loc[:, a].sum() == 0 or counts.loc[:, b].sum() == 0
-            )
-        elif counts.shape == (2, 3):
-            raise ValueError("(2,3) not yet implemented")
-        else:
-            raise ValueError(
-                f"Did not recognize shape of counts matrix: {counts.shape}"
-            )
+        return any(counts.loc[:, c].sum() == 0 for c in gt_predicate.get_categories())
 
     @staticmethod
-    def some_cell_has_greater_than_one_count(counts: pd.DataFrame) -> bool:
+    def some_cell_has_greater_than_one_count(
+        counts: pd.DataFrame,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+    ) -> bool:
         """
         If no genotype has more than one HPO count, we do not want to do a test. For instance, if MISSENSE has one
         observed HPO and N excluded, and NOT MISSENSE has zero or one observed HPO, then we will skip the test
+        
         Args:
             counts: pandas DataFrame with counts
+            ph_predicate: the phenotype predicate that produced the counts
 
         Returns: true if at least one of the genotypes has more than one observed HPO count
 
         """
-        if not isinstance(counts, pd.DataFrame):
-            raise ValueError(
-                f"argument 'counts' must be pandas DataFrame but was {type(counts)}"
-            )
-
-        return (counts.loc[PatientCategories.YES] > 1).any()
+        return (counts.loc[ph_predicate.present_phenotype_category] > 1).any()
 
     @staticmethod
     def genotypes_have_same_hpo_proportions(
         counts: pd.DataFrame,
-        gt_categories: typing.Sequence[PatientCategory],
-        delta: float = 0.01,
+        gt_predicate: GenotypePolyPredicate,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+        delta: float = 5e-4,
     ) -> bool:
         """
         If each genotype has the same proportion of observed HPOs, then we do not want to do a test.
@@ -500,28 +444,20 @@ def genotypes_have_same_hpo_proportions(
 
         Returns: true if the genotypes differ by more than `delta`
         """
-        if not isinstance(counts, pd.DataFrame):
-            raise ValueError(
-                f"argument 'counts' must be pandas DataFrame but was {type(counts)}"
-            )
+        numerators = np.array([
+            counts.loc[ph_predicate.present_phenotype_category, c] for c in gt_predicate.get_categories()
+        ])
+        denominators = np.array([
+            counts.loc[:, c].sum() for c in gt_predicate.get_categories()
+        ])
+        
+        if np.any(denominators == 0):
+            return False
 
-        if counts.shape == (2, 2):
-            assert len(gt_categories) == 2, \
-                f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!"
-            a, b = gt_categories
-            num1 = counts.loc[PatientCategories.YES, a]
-            denom1 = counts.loc[:, a].sum()
-            num2 = counts.loc[PatientCategories.YES, b]
-            denom2 = counts.loc[:, b].sum()
-            if denom1 == 0 or denom2 == 0:
-                return False
-            return abs(num1 / denom1 - num2 / denom2) < delta
-        elif counts.shape == (2, 3):
-            raise ValueError("(2,3) not implemented yet")
-        else:
-            raise ValueError(
-                f"Did not recognize shape of counts matrix: {counts.shape}"
-            )
+        ratios = numerators / denominators
+        mean_ratio = np.mean(ratios)
+        abs_diff = np.abs(ratios - mean_ratio)
+        return bool(np.all(abs_diff < delta))
 
     def _get_ordered_terms(
         self,
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index 43d8c7077..408a9fd9c 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -602,7 +602,7 @@ def _compute_result(
         # 2 - Apply MTC filter and select p values to MTC
         mtc_filter_results = self._mtc_filter.filter(
             gt_predicate=gt_predicate,
-            phenotypes=phenotypes,
+            ph_predicates=pheno_predicates,
             counts=all_counts,
         )
         mtc_mask = np.array([r.is_passed() for r in mtc_filter_results])
diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
index 9a9c496c8..ecfb14018 100644
--- a/src/gpsea/analysis/predicate/phenotype/_pheno.py
+++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -84,6 +84,13 @@ def present_phenotype_categorization(self) -> PhenotypeCategorization[P]:
         """
         pass
 
+    @property
+    def present_phenotype_category(self) -> PatientCategory:
+        """
+        Get the patient category that correspond to the group of the patients who exibit the investigated phenotype.
+        """
+        return self.present_phenotype_categorization.category
+
 
 class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]):
     """
@@ -121,6 +128,7 @@ def __init__(
             category=PatientCategories.NO,
             phenotype=self._query,
         )
+        # Some tests depend on the order of `self._categorizations`.
         self._categorizations = (self._phenotype_observed, self._phenotype_excluded)
 
     def get_question(self) -> str:
diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py
deleted file mode 100644
index ea5408e28..000000000
--- a/tests/analysis/test_analysis.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pathlib
-import typing
-
-import hpotk
-import numpy as np
-import pandas as pd
-import pytest
-
-from gpsea.analysis import (
-    apply_predicates_on_patients,
-    CohortAnalysis,
-    configure_cohort_analysis,
-)
-
-from gpsea.analysis.mtc_filter._impl import HpoMtcFilter
-from gpsea.model import *
-from gpsea.model.genome import *
-from gpsea.analysis.predicate import PatientCategories
-from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicate
-from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
-
-
-def test_apply_predicates_on_patients(
-        suox_cohort: Cohort,
-        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
-        suox_gt_predicate: GenotypePolyPredicate,
-):
-    categories, n_usable, counts = apply_predicates_on_patients(
-        patients=suox_cohort.all_patients,
-        pheno_predicates=suox_pheno_predicates,
-        gt_predicate=suox_gt_predicate,
-    )
-
-    assert isinstance(categories, typing.Collection)
-    assert PatientCategories.YES in categories
-    assert PatientCategories.NO in categories
-    assert len(categories) == 2
-
-    assert isinstance(n_usable, pd.Series)
-    assert len(n_usable) == 5
-
-    assert isinstance(counts, typing.Mapping)
-    assert len(counts) == 5
-
-    seizure_counts = counts[hpotk.TermId.from_curie('HP:0001250')]
-    assert np.array_equal(seizure_counts.to_numpy(), np.array([[17, 11], [7, 0]]))
-
-
-class TestCohortAnalysis:
-
-    @pytest.fixture
-    def cohort_analysis(
-        self,
-        suox_cohort: Cohort,
-        hpo: hpotk.MinimalOntology,
-        tmp_path: pathlib.Path,
-    ) -> CohortAnalysis:
-        return configure_cohort_analysis(
-            cohort=suox_cohort,
-            hpo=hpo,
-            cache_dir=str(tmp_path),
-        )
-
-    def test_analysis_passes_if_variant_predicate_always_returns_false(
-        self,
-        cohort_analysis: CohortAnalysis,
-        always_false_variant_predicate: VariantPredicate,
-    ):
-        results = cohort_analysis.compare_hpo_vs_genotype(
-            predicate=always_false_variant_predicate
-        )
-        assert results is not None
diff --git a/tests/analysis/test_examples.py b/tests/analysis/test_examples.py
index 536f9907c..8f17161d6 100644
--- a/tests/analysis/test_examples.py
+++ b/tests/analysis/test_examples.py
@@ -11,6 +11,8 @@
 from gpsea.model import Cohort, VariantEffect
 
 
+# TODO: remove at some point!
+@pytest.mark.skip('Obsolete tests')
 class TestCohortAnalysis:
 
     def test_compare_by_variant_effect(
diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py
index af6f14392..d904cfe43 100644
--- a/tests/analysis/test_mtc_filter.py
+++ b/tests/analysis/test_mtc_filter.py
@@ -7,9 +7,8 @@
 import pytest
 
 from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter
-from gpsea.analysis.predicate import PatientCategories
 from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
-from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
+from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate
 from gpsea.analysis.pcats import apply_predicates_on_patients
 from gpsea.model import Cohort
 
@@ -41,20 +40,36 @@ def patient_counts(
         return counts
 
     @pytest.fixture(scope='class')
-    def gt_categories(self) -> pd.Index:
-        return pd.Index([PatientCategories.YES, PatientCategories.NO])
+    def gt_predicate(
+        self,
+        suox_gt_predicate: GenotypePolyPredicate,
+    ) -> GenotypePolyPredicate:
+        return suox_gt_predicate
 
     @pytest.fixture(scope='class')
-    def pheno_categories(self) -> pd.Index:
-        return pd.Index([PatientCategories.YES, PatientCategories.NO])
+    def ph_predicate(
+        self,
+        hpo: hpotk.MinimalOntology,
+    ) -> PhenotypePolyPredicate[hpotk.TermId]:
+        """
+        For the purpose of testing counts, let's pretend the counts
+        were created by this predicate.
+        """
+        return PropagatingPhenotypePredicate(
+            hpo=hpo,
+            query=hpotk.TermId.from_curie("HP:0001250"),  # Seizure
+            missing_implies_phenotype_excluded=False,
+        )
 
     @staticmethod
     def prepare_counts_df(
         counts,
-        index: pd.Index,
-        columns: pd.Index,
+        gt_predicate: GenotypePolyPredicate,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
     ):
         values = np.array(counts).reshape((2, 2))
+        index = pd.Index(gt_predicate.get_categories())
+        columns = pd.Index(ph_predicate.get_categories())
         return pd.DataFrame(data=values, index=index, columns=columns)
 
     @pytest.mark.parametrize(
@@ -67,17 +82,17 @@ def prepare_counts_df(
         ]
     )
     def test_one_genotype_has_zero_hpo_observations(
-            self,
-            counts: typing.Tuple[int],
-            expected: bool,
-            gt_categories: pd.Index,
-            pheno_categories: pd.Index,
+        self,
+        counts: typing.Tuple[int],
+        expected: bool,
+        gt_predicate: GenotypePolyPredicate,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
     ):
-        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories)
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
 
         actual = HpoMtcFilter.one_genotype_has_zero_hpo_observations(
             counts=counts_df,
-            gt_categories=gt_categories,
+            gt_predicate=gt_predicate,
         )
 
         assert actual == expected
@@ -99,12 +114,15 @@ def test_some_cell_has_greater_than_one_count(
             self,
             counts: typing.Tuple[int],
             expected: bool,
-            gt_categories: pd.Index,
-            pheno_categories: pd.Index,
+            gt_predicate: GenotypePolyPredicate,
+            ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
     ):
-        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories)
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
 
-        actual = HpoMtcFilter.some_cell_has_greater_than_one_count(counts=counts_df)
+        actual = HpoMtcFilter.some_cell_has_greater_than_one_count(
+            counts=counts_df,
+            ph_predicate=ph_predicate,
+        )
 
         assert actual == expected
 
@@ -112,26 +130,55 @@ def test_some_cell_has_greater_than_one_count(
         "counts, expected",
         [
             ((10, 100, 50, 500), True),
+            ((0, 0, 100, 100), True),
+            ((10, 100, 50, 500), True),
+            ((10, 100, 10, 105), False),
             ((95, 60, 144 - 95, 71 - 60), False),
             ((40, 15, 18, 15), False),
         ]
     )
     def test_genotypes_have_same_hpo_proportions(
-            self,
-            counts: typing.Tuple[int],
-            expected: bool,
-            gt_categories: pd.Index,
-            pheno_categories: pd.Index,
+        self,
+        counts: typing.Tuple[int],
+        expected: bool,
+        gt_predicate: GenotypePolyPredicate,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
     ):
-        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories)
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
 
         actual = HpoMtcFilter.genotypes_have_same_hpo_proportions(
             counts=counts_df,
-            gt_categories=gt_categories,
+            gt_predicate=gt_predicate,
+            ph_predicate=ph_predicate,
         )
 
         assert actual == expected
 
+    @pytest.mark.parametrize(
+        "counts, expected",
+        [
+            ((1, 2, 99, 198), .01),
+            ((1, 3, 99, 197), .015),
+            ((0, 0, 100, 200), 0.),
+            ((0, 0, 0, 0), 0.),
+        ]
+    )
+    def test_get_maximum_group_observed_HPO_frequency(
+        self,
+        counts: typing.Tuple[int],
+        expected: float,
+        gt_predicate: GenotypePolyPredicate,
+        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+    ):
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
+
+        actual = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+            counts_frame=counts_df,
+            ph_predicate=ph_predicate,
+        )
+
+        assert actual == pytest.approx(expected)
+
     def test_filter_terms_to_test(
         self,
         mtc_filter: HpoMtcFilter,
@@ -139,11 +186,9 @@ def test_filter_terms_to_test(
         suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
         patient_counts: typing.Sequence[pd.DataFrame],
     ):
-        phenotypes = [p.phenotype for p in suox_pheno_predicates]
-        
         mtc_report = mtc_filter.filter(
             gt_predicate=suox_gt_predicate,
-            phenotypes=phenotypes,
+            ph_predicates=suox_pheno_predicates,
             counts=patient_counts,
         )
 
@@ -175,11 +220,10 @@ def test_specified_term_mtc_filter(
         specified_filter = SpecifiedTermsMtcFilter(
             terms_to_test=(hpotk.TermId.from_curie("HP:0032350"),),
         )
-        phenotypes = [p.phenotype for p in suox_pheno_predicates]
         
         mtc_report = specified_filter.filter(
             gt_predicate=suox_gt_predicate,
-            phenotypes=phenotypes,
+            ph_predicates=suox_pheno_predicates,
             counts=patient_counts,
         )
         assert isinstance(mtc_report, typing.Sequence)
@@ -221,26 +265,51 @@ def test_min_observed_HPO_threshold(
             in enumerate(suox_pheno_predicates)
         }
         # Ectopia lentis HP:0001083  (6 9  1 2), freqs are 6/15=0.4 and 1/3=0.33
-        ectopia = patient_counts[curie2idx["HP:0001083"]]
-        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ectopia)
+        idx = curie2idx["HP:0001083"]
+        ectopia = patient_counts[idx]
+        ectopia_predicate = suox_pheno_predicates[idx]
+        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+            ectopia,
+            ph_predicate=ectopia_predicate,
+        )
         assert max_f == pytest.approx(0.4, abs=EPSILON)
         
         # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1
-        seizure = patient_counts[curie2idx["HP:0001250"]]
-        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(seizure)
+        idx = curie2idx["HP:0001250"]
+        seizure = patient_counts[idx]
+        seizure_predicate = suox_pheno_predicates[idx]
+        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+            seizure,
+            ph_predicate=seizure_predicate
+        )
         assert max_f == pytest.approx(1.0, abs=EPSILON)
         
         # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1
-        sulfocysteinuria = patient_counts[curie2idx["HP:0032350"]]
-        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(sulfocysteinuria)
+        idx = curie2idx["HP:0032350"]
+        sulfocysteinuria = patient_counts[idx]
+        sulfocysteinuria_predicate = suox_pheno_predicates[idx]
+        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+            sulfocysteinuria,
+            ph_predicate=sulfocysteinuria_predicate,
+        )
         assert max_f == pytest.approx(1.0, abs=EPSILON)
         
         # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5
-        ndelay = patient_counts[curie2idx["HP:0012758"]]
-        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ndelay)
+        idx = curie2idx["HP:0012758"]
+        ndelay = patient_counts[idx]
+        ndelay_predicate = suox_pheno_predicates[idx]
+        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+            ndelay,
+            ph_predicate=ndelay_predicate,
+        )
         assert max_f == pytest.approx(0.5, abs=EPSILON)
         
         # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714
-        hypertonia = patient_counts[curie2idx["HP:0001276"]]
-        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(hypertonia)
+        idx = curie2idx["HP:0001276"]
+        hypertonia = patient_counts[idx]
+        hypertonia_predicate = suox_pheno_predicates[idx]
+        max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+            hypertonia,
+            ph_predicate=hypertonia_predicate,
+        )
         assert max_f == pytest.approx(0.5714, abs=EPSILON)

From af302c0a61fe5c38c88c893eb834075c1ca16917 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 17:03:59 +0200
Subject: [PATCH 14/16] Implement filtering genotype predicate for choosing the
 genotype categories of interest.

---
 src/gpsea/analysis/predicate/genotype/_api.py | 82 ++++++++++++++++++-
 .../predicate/genotype/test_gt_predicates.py  | 82 +++++++++++++++++++
 2 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py
index 39cf020be..a90bb5c88 100644
--- a/src/gpsea/analysis/predicate/genotype/_api.py
+++ b/src/gpsea/analysis/predicate/genotype/_api.py
@@ -1,7 +1,7 @@
 import abc
 import typing
 
-from gpsea.model import Variant
+from gpsea.model import Patient, Variant
 from .._api import PolyPredicate, Categorization, PatientCategory
 
 
@@ -10,7 +10,85 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta
     `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate`
     that test the genotype axis.
     """
-    pass
+
+    @staticmethod
+    def filtering_predicate(
+        predicate: "GenotypePolyPredicate",
+        targets: typing.Collection[Categorization],
+    ) -> "GenotypePolyPredicate":
+        """
+        """
+        return FilteringGenotypePolyPredicate.create(
+            predicate=predicate,
+            targets=targets,
+        )
+
+
+class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
+    # NOT PART OF THE PUBLIC API
+
+    @staticmethod
+    def create(
+        predicate: "GenotypePolyPredicate",
+        targets: typing.Collection[Categorization],
+    ) -> "FilteringGenotypePolyPredicate":
+        # At least 2 target categorizations must be provided
+        if len(targets) <= 1:
+            raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}')
+
+        good_boys = tuple(isinstance(cat, Categorization) for cat in targets)
+        if not all(good_boys):
+            offenders = ', '.join(
+                str(i)
+                for i, is_instance
+                in enumerate(good_boys) if not is_instance
+            )
+            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')
+
+        # All `allowed` categorizations must in fact be present in the `base` predicate.
+        cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets)
+        if not all(cats_are_in_fact_present):
+            missing = ', '.join(
+                c.category.name
+                for c, is_present
+                in zip(targets, cats_are_in_fact_present) if not is_present
+            )
+            raise ValueError(f'Some from the categories are not present: {missing}')
+        
+        if len(targets) == predicate.n_categorizations():
+            raise ValueError(
+                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
+                f'with the same number ({len(targets)}) of targets'
+            )
+
+        return FilteringGenotypePolyPredicate(
+            predicate=predicate,
+            allowed=targets,
+        )
+
+    def __init__(
+        self,
+        predicate: "GenotypePolyPredicate",
+        allowed: typing.Iterable[Categorization],
+    ):
+        self._predicate = predicate
+        self._allowed = tuple(allowed)
+    
+    def get_categorizations(self) -> typing.Sequence[Categorization]:
+        return self._allowed
+
+    def get_question(self) -> str:
+        return self._predicate.get_question()
+
+    def test(self, patient: Patient) -> typing.Optional[Categorization]:
+        cat = self._predicate.test(patient)
+        if cat in self._allowed:
+            return cat
+        else:
+            return None
+
+    def __repr__(self):
+        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
 
 
 class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index 36904159f..adbcf532d 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -1,3 +1,5 @@
+import typing
+
 import pytest
 
 from gpsea.model import *
@@ -180,3 +182,83 @@ def test_x_recessive(
         assert categorization is not None
 
         assert categorization.category.name == name
+
+
+class TestPolyPredicate:
+
+    @pytest.fixture(scope="class")
+    def x_recessive_gt_predicate(self) -> GenotypePolyPredicate:
+        affects_suox = VariantPredicates.gene("SUOX")
+        return ModeOfInheritancePredicate.x_recessive(
+            variant_predicate=affects_suox,
+        )
+
+    @pytest.mark.parametrize(
+        "indices, expected",
+        [
+            ((0, 1), 2),
+            ((1, 0), 2),
+            ((1, 2), 2),
+        ],
+    )
+    def test_filtering_predicate(
+        self,
+        indices: typing.Sequence[int],
+        expected: int,
+        x_recessive_gt_predicate: GenotypePolyPredicate,
+    ):
+        cats = x_recessive_gt_predicate.get_categorizations()
+        targets = [cats[i] for i in indices]
+        predicate = GenotypePolyPredicate.filtering_predicate(
+            predicate=x_recessive_gt_predicate,
+            targets=targets,
+        )
+
+        actual = len(predicate.get_categorizations())
+
+        assert actual == expected
+
+    def test_filtering_predicate__explodes_when_not_subsetting(
+        self,
+        x_recessive_gt_predicate: GenotypePolyPredicate,
+    ):
+        with pytest.raises(ValueError) as ve:
+            GenotypePolyPredicate.filtering_predicate(
+                predicate=x_recessive_gt_predicate,
+                targets=x_recessive_gt_predicate.get_categorizations(),
+            )
+
+        assert (
+            ve.value.args[0]
+            == "It makes no sense to subset the a predicate with 4 categorizations with the same number (4) of targets"
+        )
+
+    def test_filtering_predicate__explodes_when_using_random_junk(
+        self,
+        x_recessive_gt_predicate: GenotypePolyPredicate,
+    ):
+        with pytest.raises(ValueError) as ve:
+            GenotypePolyPredicate.filtering_predicate(
+                predicate=x_recessive_gt_predicate,
+                targets=(0, 1),
+            )
+
+        assert (
+            ve.value.args[0]
+            == "The targets at following indices are not categorizations: [0, 1]"
+        )
+
+    def test_filtering_predicate__explodes_when_using_one_category(
+        self,
+        x_recessive_gt_predicate: GenotypePolyPredicate,
+    ):
+        with pytest.raises(ValueError) as ve:
+            GenotypePolyPredicate.filtering_predicate(
+                predicate=x_recessive_gt_predicate,
+                targets=(x_recessive_gt_predicate.get_categorizations()[0],),
+            )
+
+        assert (
+            ve.value.args[0]
+            == "At least 2 target categorizations must be provided but got 1"
+        )

From 21caddc68d76261ca9f494e62d8de818cfdb73b8 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 17:22:05 +0200
Subject: [PATCH 15/16] Improve generating the polypredicate question.

---
 docs/tutorial.rst                               |  2 +-
 docs/user-guide/predicates.rst                  |  8 ++++----
 docs/user-guide/stats.rst                       |  4 ++--
 src/gpsea/analysis/_api.py                      |  2 +-
 src/gpsea/analysis/_gp_analysis.py              |  4 ++--
 src/gpsea/analysis/pcats/_impl.py               |  6 +++---
 src/gpsea/analysis/predicate/_api.py            | 17 ++++++++++++++++-
 src/gpsea/analysis/predicate/genotype/_api.py   |  4 ++--
 .../predicate/genotype/_gt_predicates.py        | 17 +++++++----------
 .../analysis/predicate/phenotype/_pheno.py      |  6 +++---
 .../predicate/genotype/test_gt_predicates.py    |  4 ++--
 11 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 29d8f2d9a..0a583b39e 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -171,7 +171,7 @@ in the individuals of the *TBX5* cohort.
 ...     ),
 ...     group_names=('Missense', 'Frameshift'),
 ... )
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
 'Genotype group: Missense, Frameshift'
 
 .. note::
diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst
index 908afa147..6d95dfc5e 100644
--- a/docs/user-guide/predicates.rst
+++ b/docs/user-guide/predicates.rst
@@ -300,7 +300,7 @@ for assigning a patient into a genotype group:
 
 >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
 >>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
 'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'
 
 The `gt_predicate` can be used in downstream analysis, such as in :class:
@@ -337,7 +337,7 @@ for testing if the individual has at least one missense vs. frameshift vs. synon
 ...     ),
 ...     group_names=('Missense', 'Frameshift', 'Synonymous'),
 ... )
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
 'Genotype group: Missense, Frameshift, Synonymous'
 
 
@@ -380,8 +380,8 @@ to test for a presence of `Abnormal lens morphology <https://hpo.jax.org/browse/
 ...     hpo=hpo,
 ...     query=query,
 ... )
->>> pheno_predicate.get_question()
-'Is Abnormal lens morphology present in the patient?'
+>>> pheno_predicate.display_question()
+'Is Abnormal lens morphology present in the patient: Yes, No'
 
 
 TODO: explain ``missing_implies_phenotype_excluded``
diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst
index 16f0060b7..5a8717f3b 100644
--- a/docs/user-guide/stats.rst
+++ b/docs/user-guide/stats.rst
@@ -138,7 +138,7 @@ we expect the autosomal dominant mode of inheritance:
 
 >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
 >>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift)
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
 'Which genotype group does the patient fit in: HOM_REF, HET'
 
 `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group
@@ -409,7 +409,7 @@ The genotype predicate will bin the patient into two groups: a point mutation gr
 ...     predicates=(point_mutation, lof_mutation),
 ...     group_names=('Point', 'LoF'),
 ... )
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
 'Genotype group: Point, LoF'
 
 
diff --git a/src/gpsea/analysis/_api.py b/src/gpsea/analysis/_api.py
index ca7bd8e49..b2b7d41d7 100644
--- a/src/gpsea/analysis/_api.py
+++ b/src/gpsea/analysis/_api.py
@@ -189,7 +189,7 @@ def summarize(
         # Column index: multiindex of counts and percentages for all genotype predicate groups
         geno_idx = pd.MultiIndex.from_product(
             iterables=(self._geno_predicate.get_categories(), ('Count', 'Percent')),
-            names=(self._geno_predicate.get_question(), None),
+            names=(self._geno_predicate.get_question_base(), None),
         )
 
         # We'll fill this frame with data
diff --git a/src/gpsea/analysis/_gp_analysis.py b/src/gpsea/analysis/_gp_analysis.py
index 35d143053..f786049ce 100644
--- a/src/gpsea/analysis/_gp_analysis.py
+++ b/src/gpsea/analysis/_gp_analysis.py
@@ -63,11 +63,11 @@ def apply_predicates_on_patients(
                 data=0,
                 index=pd.Index(
                     data=ph_predicate.get_categories(),
-                    name=ph_predicate.get_question(),
+                    name=ph_predicate.get_question_base(),
                 ),
                 columns=pd.Index(
                     data=gt_predicate.get_categories(),
-                    name=gt_predicate.get_question(),
+                    name=gt_predicate.get_question_base(),
                 ),
             )
 
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index 408a9fd9c..e8045d2d0 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -64,11 +64,11 @@ def apply_predicates_on_patients(
                 data=0,
                 index=pd.Index(
                     data=ph_predicate.get_categories(),
-                    name=ph_predicate.get_question(),
+                    name=ph_predicate.get_question_base(),
                 ),
                 columns=pd.Index(
                     data=gt_predicate.get_categories(),
-                    name=gt_predicate.get_question(),
+                    name=gt_predicate.get_question_base(),
                 ),
             )
 
@@ -198,7 +198,7 @@ def summarize(
         gt_idx = pd.MultiIndex.from_product(
             # TODO: fix the below
             iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")),
-            names=(self._gt_predicate.get_question(), None),
+            names=(self._gt_predicate.get_question_base(), None),
         )
 
         # We'll fill this frame with data
diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py
index da446a546..534eca18d 100644
--- a/src/gpsea/analysis/predicate/_api.py
+++ b/src/gpsea/analysis/predicate/_api.py
@@ -154,6 +154,12 @@ def get_categories(self) -> typing.Iterator[PatientCategory]:
         """
         return (c.category for c in self.get_categorizations())
 
+    def get_category_names(self) -> typing.Iterator[str]:
+        """
+        Get an iterator with names of the :class:`PatientCategory` items that the predicate can produce.
+        """
+        return (cat.name for cat in self.get_categories())
+
     def get_category(
         self,
         cat_id: int,
@@ -182,12 +188,21 @@ def get_category_name(
         return self.get_category(cat_id).name
 
     @abc.abstractmethod
-    def get_question(self) -> str:
+    def get_question_base(self) -> str:
         """
         Prepare a `str` with the question the predicate can answer.
         """
         pass
 
+    def display_question(self) -> str:
+        """
+        Prepare the question which the predicate can answer.
+
+        The question includes the question base and the category names
+        """
+        cat_names = ', '.join(self.get_category_names())
+        return f'{self.get_question_base()}: {cat_names}'
+
     @abc.abstractmethod
     def test(self, patient: Patient) -> typing.Optional[C]:
         """
diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py
index a90bb5c88..db29c573d 100644
--- a/src/gpsea/analysis/predicate/genotype/_api.py
+++ b/src/gpsea/analysis/predicate/genotype/_api.py
@@ -77,8 +77,8 @@ def __init__(
     def get_categorizations(self) -> typing.Sequence[Categorization]:
         return self._allowed
 
-    def get_question(self) -> str:
-        return self._predicate.get_question()
+    def get_question_base(self) -> str:
+        return self._predicate.get_question_base()
 
     def test(self, patient: Patient) -> typing.Optional[Categorization]:
         cat = self._predicate.test(patient)
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 23fc4a5df..4007e8d1b 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -39,7 +39,7 @@ def get_categorizations(self) -> typing.Sequence[Categorization]:
             AlleleCountingGenotypeBooleanPredicate.NO,
         )
 
-    def get_question(self) -> str:
+    def get_question_base(self) -> str:
         return self._allele_counter.get_question()
 
     def test(self, patient: Patient) -> typing.Optional[Categorization]:
@@ -92,13 +92,12 @@ def __init__(
     ):
         self._counters = tuple(counters)
         self._categorizations = tuple(categorizations)
-        group_names = ", ".join(c.category.name for c in self._categorizations)
-        self._question = f"Genotype group: {group_names}"
+        self._question = "Genotype group"
 
     def get_categorizations(self) -> typing.Sequence[Categorization]:
         return self._categorizations
 
-    def get_question(self) -> str:
+    def get_question_base(self) -> str:
         return self._question
 
     def test(self, patient: Patient) -> typing.Optional[Categorization]:
@@ -132,7 +131,7 @@ def __hash__(self) -> int:
         )
 
     def __str__(self) -> str:
-        return self.get_question()
+        return self.get_question_base()
 
     def __repr__(self) -> str:
         return (
@@ -194,7 +193,7 @@ def __init__(
     ):
         self._allele_counter = allele_counter
 
-    def get_question(self) -> str:
+    def get_question_base(self) -> str:
         return self._allele_counter.get_question()
 
     def test(self, patient: Patient) -> typing.Optional[Categorization]:
@@ -558,14 +557,12 @@ def __init__(
         issues = ModeOfInheritancePredicate._check_categorizations(self._categorizations)
         if issues:
             raise ValueError('Cannot create predicate: {}'.format(', '.join(issues)))
-        self._question = 'Which genotype group does the patient fit in: {}'.format(
-            ', '.join(cat.category.name for cat in self._categorizations),
-        )
+        self._question = 'Which genotype group does the patient fit in'
 
     def get_categorizations(self) -> typing.Sequence[Categorization]:
         return self._categorizations
 
-    def get_question(self) -> str:
+    def get_question_base(self) -> str:
         return self._question
 
     def test(
diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
index ecfb14018..3b2d9a7b4 100644
--- a/src/gpsea/analysis/predicate/phenotype/_pheno.py
+++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -131,8 +131,8 @@ def __init__(
         # Some tests depend on the order of `self._categorizations`.
         self._categorizations = (self._phenotype_observed, self._phenotype_excluded)
 
-    def get_question(self) -> str:
-        return f"Is {self._query_label} present in the patient?"
+    def get_question_base(self) -> str:
+        return f"Is {self._query_label} present in the patient"
 
     @property
     def phenotype(self) -> hpotk.TermId:
@@ -222,7 +222,7 @@ def __init__(self, disease_id_query: hpotk.TermId):
             phenotype=disease_id_query,
         )
 
-    def get_question(self) -> str:
+    def get_question_base(self) -> str:
         return f"Was {self._query} diagnosed in the patient"
 
     @property
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index adbcf532d..97d4fee06 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -36,8 +36,8 @@ def test_get_question(
         self,
         predicate: GenotypePolyPredicate,
     ):
-        question = predicate.get_question()
-        assert question == "Genotype group: Point, LoF"
+        question = predicate.get_question_base()
+        assert question == "Genotype group"
 
     def test_get_categorizations(
         self,

From 8ba618e3f71f599fd1629d19b104c307504c1fcd Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 3 Sep 2024 20:46:24 +0200
Subject: [PATCH 16/16] Improve docs for `filtering_predicate`.

---
 docs/user-guide/predicates.rst                | 58 +++++++++++
 .../analysis/predicate/genotype/__init__.py   |  4 +-
 src/gpsea/analysis/predicate/genotype/_api.py | 82 +---------------
 .../predicate/genotype/_gt_predicates.py      | 95 +++++++++++++++++++
 .../predicate/genotype/test_gt_predicates.py  |  9 +-
 .../predicate/genotype/test_predicates.py     |  1 +
 6 files changed, 163 insertions(+), 86 deletions(-)

diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst
index 6d95dfc5e..b224516f5 100644
--- a/docs/user-guide/predicates.rst
+++ b/docs/user-guide/predicates.rst
@@ -306,6 +306,64 @@ for assigning a patient into a genotype group:
 The `gt_predicate` can be used in downstream analysis, such as in :class:
 
 
+.. _filtering-predicate:
+
+Filtering predicate
+===================
+
+Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need
+to consider only a subset of the groups. A `GenotypePolyPredicate`
+created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset
+of the target categorizations of interest.
+
+Example
+-------
+
+Let's suppose we want test the genotype-phenotype association between variants
+that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`,
+and we are specifically interested in comparing the heterozygous variants
+in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous).
+
+First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame:
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates
+>>> tx_id = 'NM_1234.5'
+>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \
+...     | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id)
+>>> is_frameshift_or_stop_gain.get_question()
+'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)'
+
+Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+to bin according to a genotype group:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
+>>> gt_predicate.display_question()
+'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'
+
+We see that the `gt_predicate` bins the patients into three groups:
+
+>>> cats = gt_predicate.get_categorizations()
+>>> cats
+(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT))
+
+We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function,
+and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+that includes only the categories of interest:
+
+>>> from gpsea.analysis.predicate.genotype import filtering_predicate
+>>> fgt_predicate = filtering_predicate(
+...     predicate=gt_predicate,
+...     targets=(cats[1], cats[2]),
+... )
+>>> fgt_predicate.display_question()
+'Which genotype group does the patient fit in: HET, BIALLELIC_ALT'
+
+
+.. _groups-predicate:
+
 Groups predicate
 ================
 
diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py
index ff8d16e1b..86dd261a2 100644
--- a/src/gpsea/analysis/predicate/genotype/__init__.py
+++ b/src/gpsea/analysis/predicate/genotype/__init__.py
@@ -1,13 +1,13 @@
 from ._api import GenotypePolyPredicate
 from ._api import VariantPredicate
 from ._counter import AlleleCounter
-from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate
+from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate
 from ._gt_predicates import ModeOfInheritancePredicate
 from ._variant import VariantPredicates, ProteinPredicates
 
 __all__ = [
     'GenotypePolyPredicate',
-    'boolean_predicate', 'groups_predicate', 'recessive_predicate',
+    'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate',
     'ModeOfInheritancePredicate',
     'AlleleCounter', 'VariantPredicate',
     'VariantPredicates', 'ProteinPredicates',
diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py
index db29c573d..39cf020be 100644
--- a/src/gpsea/analysis/predicate/genotype/_api.py
+++ b/src/gpsea/analysis/predicate/genotype/_api.py
@@ -1,7 +1,7 @@
 import abc
 import typing
 
-from gpsea.model import Patient, Variant
+from gpsea.model import Variant
 from .._api import PolyPredicate, Categorization, PatientCategory
 
 
@@ -10,85 +10,7 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta
     `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate`
     that test the genotype axis.
     """
-
-    @staticmethod
-    def filtering_predicate(
-        predicate: "GenotypePolyPredicate",
-        targets: typing.Collection[Categorization],
-    ) -> "GenotypePolyPredicate":
-        """
-        """
-        return FilteringGenotypePolyPredicate.create(
-            predicate=predicate,
-            targets=targets,
-        )
-
-
-class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
-    # NOT PART OF THE PUBLIC API
-
-    @staticmethod
-    def create(
-        predicate: "GenotypePolyPredicate",
-        targets: typing.Collection[Categorization],
-    ) -> "FilteringGenotypePolyPredicate":
-        # At least 2 target categorizations must be provided
-        if len(targets) <= 1:
-            raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}')
-
-        good_boys = tuple(isinstance(cat, Categorization) for cat in targets)
-        if not all(good_boys):
-            offenders = ', '.join(
-                str(i)
-                for i, is_instance
-                in enumerate(good_boys) if not is_instance
-            )
-            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')
-
-        # All `allowed` categorizations must in fact be present in the `base` predicate.
-        cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets)
-        if not all(cats_are_in_fact_present):
-            missing = ', '.join(
-                c.category.name
-                for c, is_present
-                in zip(targets, cats_are_in_fact_present) if not is_present
-            )
-            raise ValueError(f'Some from the categories are not present: {missing}')
-        
-        if len(targets) == predicate.n_categorizations():
-            raise ValueError(
-                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
-                f'with the same number ({len(targets)}) of targets'
-            )
-
-        return FilteringGenotypePolyPredicate(
-            predicate=predicate,
-            allowed=targets,
-        )
-
-    def __init__(
-        self,
-        predicate: "GenotypePolyPredicate",
-        allowed: typing.Iterable[Categorization],
-    ):
-        self._predicate = predicate
-        self._allowed = tuple(allowed)
-    
-    def get_categorizations(self) -> typing.Sequence[Categorization]:
-        return self._allowed
-
-    def get_question_base(self) -> str:
-        return self._predicate.get_question_base()
-
-    def test(self, patient: Patient) -> typing.Optional[Categorization]:
-        cat = self._predicate.test(patient)
-        if cat in self._allowed:
-            return cat
-        else:
-            return None
-
-    def __repr__(self):
-        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
+    pass
 
 
 class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 4007e8d1b..43a39594a 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -151,6 +151,8 @@ def groups_predicate(
     The genotype groups *should* not overlap.
     In case of an overlap, the patient will be assigned into no group (`None`).
 
+    See the :ref:`groups-predicate` section for an example.
+
     :param predicates: an iterable with at least 2 variant predicates to determine a genotype group.
     :param group_names: an iterable with group names. The number of group names must match the number of predicates.
     """
@@ -182,6 +184,99 @@ def groups_predicate(
     )
 
 
+class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
+    # NOT PART OF THE PUBLIC API
+
+    @staticmethod
+    def create(
+        predicate: "GenotypePolyPredicate",
+        targets: typing.Collection[Categorization],
+    ) -> "FilteringGenotypePolyPredicate":
+        # At least 2 target categorizations must be provided
+        if len(targets) <= 1:
+            raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}')
+
+        good_boys = tuple(isinstance(cat, Categorization) for cat in targets)
+        if not all(good_boys):
+            offenders = ', '.join(
+                str(i)
+                for i, is_instance
+                in enumerate(good_boys) if not is_instance
+            )
+            raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')
+
+        # All `allowed` categorizations must in fact be present in the `base` predicate.
+        cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets)
+        if not all(cats_are_in_fact_present):
+            missing = ', '.join(
+                c.category.name
+                for c, is_present
+                in zip(targets, cats_are_in_fact_present) if not is_present
+            )
+            raise ValueError(f'Some from the categories are not present: {missing}')
+        
+        if len(targets) == predicate.n_categorizations():
+            raise ValueError(
+                f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
+                f'with the same number ({len(targets)}) of targets'
+            )
+
+        return FilteringGenotypePolyPredicate(
+            predicate=predicate,
+            allowed=targets,
+        )
+
+    def __init__(
+        self,
+        predicate: "GenotypePolyPredicate",
+        allowed: typing.Iterable[Categorization],
+    ):
+        self._predicate = predicate
+        self._allowed = tuple(allowed)
+    
+    def get_categorizations(self) -> typing.Sequence[Categorization]:
+        return self._allowed
+
+    def get_question_base(self) -> str:
+        return self._predicate.get_question_base()
+
+    def test(self, patient: Patient) -> typing.Optional[Categorization]:
+        cat = self._predicate.test(patient)
+        if cat in self._allowed:
+            return cat
+        else:
+            return None
+
+    def __repr__(self):
+        return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
+
+ 
+def filtering_predicate(
+    predicate: GenotypePolyPredicate,
+    targets: typing.Collection[Categorization],
+) -> GenotypePolyPredicate:
+    """
+    Filtering predicate applies the base `predicate` but only returns the categorizations
+    from the provided `targets` collection.
+
+    This can be useful if only some of the categorizations are interesting.
+    For instance, if we only seek to compare the differences between heterozygous and hemizygous variants,
+    but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups.
+
+    See the :ref:`filtering-predicate` section for an example.
+
+    The `predicate` is checked for being able to produce the all items in `targets`
+    and the `targets` must include at least 2 categorizations.
+
+    :param predicate: the base predicate whose categorizations are subject to filteration.
+    :param targets: the categorizations to retain
+    """
+    return FilteringGenotypePolyPredicate.create(
+        predicate=predicate,
+        targets=targets,
+    )
+
+
 class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate):
     # NOT PART OF THE PUBLIC API
     # TODO: this predicate is a bit weird and I think it should eventually go away.
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index 97d4fee06..67bc2d117 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -6,6 +6,7 @@
 from gpsea.analysis.predicate.genotype import (
     GenotypePolyPredicate,
     groups_predicate,
+    filtering_predicate,
     VariantPredicates,
     VariantPredicate,
     ModeOfInheritancePredicate,
@@ -209,7 +210,7 @@ def test_filtering_predicate(
     ):
         cats = x_recessive_gt_predicate.get_categorizations()
         targets = [cats[i] for i in indices]
-        predicate = GenotypePolyPredicate.filtering_predicate(
+        predicate = filtering_predicate(
             predicate=x_recessive_gt_predicate,
             targets=targets,
         )
@@ -223,7 +224,7 @@ def test_filtering_predicate__explodes_when_not_subsetting(
         x_recessive_gt_predicate: GenotypePolyPredicate,
     ):
         with pytest.raises(ValueError) as ve:
-            GenotypePolyPredicate.filtering_predicate(
+            filtering_predicate(
                 predicate=x_recessive_gt_predicate,
                 targets=x_recessive_gt_predicate.get_categorizations(),
             )
@@ -238,7 +239,7 @@ def test_filtering_predicate__explodes_when_using_random_junk(
         x_recessive_gt_predicate: GenotypePolyPredicate,
     ):
         with pytest.raises(ValueError) as ve:
-            GenotypePolyPredicate.filtering_predicate(
+            filtering_predicate(
                 predicate=x_recessive_gt_predicate,
                 targets=(0, 1),
             )
@@ -253,7 +254,7 @@ def test_filtering_predicate__explodes_when_using_one_category(
         x_recessive_gt_predicate: GenotypePolyPredicate,
     ):
         with pytest.raises(ValueError) as ve:
-            GenotypePolyPredicate.filtering_predicate(
+            filtering_predicate(
                 predicate=x_recessive_gt_predicate,
                 targets=(x_recessive_gt_predicate.get_categorizations()[0],),
             )
diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py
index 2f78d7185..a076adacf 100644
--- a/tests/analysis/predicate/genotype/test_predicates.py
+++ b/tests/analysis/predicate/genotype/test_predicates.py
@@ -202,6 +202,7 @@ def test_protein_feature_id(
 
         assert predicate.test(missense_variant) == expected
 
+
 class TestLogicalVariantPredicate:
     """
     Test that the AND and OR variant predicate combinators work as expected.