From 3f38f152fce9322aa5c86b65db17d490d6468c91 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 27 Aug 2024 21:03:52 +0200 Subject: [PATCH 01/16] Configure the protein metadata service in the `preprocessing` module. --- src/gpsea/analysis/_config.py | 53 ++++++++----------------- src/gpsea/preprocessing/__init__.py | 4 +- src/gpsea/preprocessing/_config.py | 61 ++++++++++++++++++++++++++--- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py index 3fb67fc77..dd723347c 100644 --- a/src/gpsea/analysis/_config.py +++ b/src/gpsea/analysis/_config.py @@ -1,13 +1,15 @@ +import enum import logging import os import typing -import enum +import warnings + import hpotk from gpsea.config import get_cache_dir_path from gpsea.model import Cohort -from gpsea.preprocessing import ProteinMetadataService, UniprotProteinMetadataService, ProteinAnnotationCache, \ - ProtCachingMetadataService +from gpsea.preprocessing import ProteinMetadataService +from gpsea.preprocessing import configure_default_protein_metadata_service as backup_pms from .mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter from ._api import CohortAnalysis from ._gp_analysis import FisherExactAnalyzer @@ -264,7 +266,11 @@ def configure_cohort_analysis( """ if config is None: config = CohortAnalysisConfiguration() # Use the default config - cache_dir = _configure_cache_dir(cache_dir) + + cache_path = get_cache_dir_path(cache_dir) + os.makedirs(cache_path, exist_ok=True) + + cache_dir = str(cache_path) protein_metadata_service = configure_default_protein_metadata_service(protein_source, cache_dir) mtc_filter: PhenotypeMtcFilter @@ -338,36 +344,9 @@ def configure_default_protein_metadata_service( in current working directory under `.gpsea_cache/protein_cache` and reach out to UNIPROT REST API if a cache entry is missing. """ - cache_dir = _configure_cache_dir(cache_dir) - return _configure_protein_service(protein_fallback=protein_source, cache_dir=cache_dir) - - -def _configure_protein_service( - protein_fallback: str, - cache_dir: str, -) -> ProteinMetadataService: - # (1) ProteinMetadataService - # Setup fallback - protein_fallback = _configure_fallback_protein_service(protein_fallback) - # Setup protein metadata cache - prot_cache_dir = os.path.join(cache_dir, 'protein_cache') - os.makedirs(prot_cache_dir, exist_ok=True) - prot_cache = ProteinAnnotationCache(prot_cache_dir) - # Assemble the final protein metadata service - protein_metadata_service = ProtCachingMetadataService(prot_cache, protein_fallback) - return protein_metadata_service - - -def _configure_fallback_protein_service(protein_fallback: str) -> ProteinMetadataService: - if protein_fallback == 'UNIPROT': - fallback1 = UniprotProteinMetadataService() - else: - raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}') - return fallback1 - - -def _configure_cache_dir(cache_dir: typing.Optional[str]) -> str: - cache_path = get_cache_dir_path(cache_dir) - os.makedirs(cache_path, exist_ok=True) - - return str(cache_path) + # TODO: remove at some point. + warnings.warn( + 'Use gpsea.preprocessing.configure_default_protein_metadata_service` instead', + DeprecationWarning, stacklevel=2, + ) + return backup_pms(protein_source=protein_source, cache_dir=cache_dir) diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py index b4367167b..1551b4d63 100644 --- a/src/gpsea/preprocessing/__init__.py +++ b/src/gpsea/preprocessing/__init__.py @@ -4,7 +4,7 @@ from ._config import configure_caching_patient_creator, configure_patient_creator from ._config import load_phenopacket_folder, load_phenopackets from ._config import configure_caching_cohort_creator, configure_cohort_creator -from ._config import configure_protein_metadata_service +from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service from ._generic import DefaultImpreciseSvFunctionalAnnotator from ._patient import PatientCreator, CohortCreator from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator @@ -18,7 +18,7 @@ __all__ = [ 'configure_caching_patient_creator', 'configure_patient_creator', 'configure_caching_cohort_creator', 'configure_cohort_creator', - 'configure_protein_metadata_service', + 'configure_default_protein_metadata_service', 'configure_protein_metadata_service', 'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService', 'PatientCreator', 'CohortCreator', 'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopackets', diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index 67c14eab3..348bd5b1c 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -225,14 +225,65 @@ def configure_protein_metadata_service( In any case, the directory will be created if it does not exist (including any non-existing parents). :param timeout: timeout in seconds for the REST APIs. """ + warnings.warn('Use `configure_default_protein_metadata_service` instead', DeprecationWarning, stacklevel=2) + return configure_default_protein_metadata_service( + cache_dir=cache_dir, + timeout=timeout, + ) + + +def configure_default_protein_metadata_service( + protein_source: typing.Literal['UNIPROT'] = 'UNIPROT', + cache_dir: typing.Optional[str] = None, + timeout: float = 30., +) -> ProteinMetadataService: + """ + Create default protein metadata service that will cache the protein metadata + in current working directory under `.gpsea_cache/protein_cache` + and reach out to UNIPROT REST API if a cache entry is missing. + + :param protein_source: a `str` with the code of the protein data sources (currently accepting just `UNIPROT`). + :param cache_dir: path to the folder where we will cache the results fetched from the remote APIs or `None` + if the data should be cached as described by :func:`~gpsea.config.get_cache_dir_path` function. + In any case, the directory will be created if it does not exist (including any non-existing parents). + :param timeout: timeout in seconds for the REST APIs. + """ cache_dir = _configure_cache_dir(cache_dir) + return _configure_protein_service( + protein_fallback=protein_source, + cache_dir=cache_dir, + timeout=timeout, + ) - protein_cache_dir = os.path.join(cache_dir, "protein_cache") - os.makedirs(protein_cache_dir, exist_ok=True) - cache = ProteinAnnotationCache(protein_cache_dir) - fallback = UniprotProteinMetadataService(timeout=timeout) - return ProtCachingMetadataService(cache=cache, fallback=fallback) +def _configure_protein_service( + protein_fallback: str, + cache_dir: str, + timeout: float, +) -> ProteinMetadataService: + # (1) ProteinMetadataService + # Setup fallback + protein_fallback = _configure_fallback_protein_service( + protein_fallback, + timeout, + ) + # Setup protein metadata cache + prot_cache_dir = os.path.join(cache_dir, 'protein_cache') + os.makedirs(prot_cache_dir, exist_ok=True) + prot_cache = ProteinAnnotationCache(prot_cache_dir) + # Assemble the final protein metadata service + protein_metadata_service = ProtCachingMetadataService(prot_cache, protein_fallback) + return protein_metadata_service + + +def _configure_fallback_protein_service( + protein_fallback: str, + timeout: float, +) -> ProteinMetadataService: + if protein_fallback == 'UNIPROT': + return UniprotProteinMetadataService(timeout) + else: + raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}') def _configure_cache_dir( From d641c0835dc0e5f9c7af055179c1f510d876c648 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 28 Aug 2024 10:47:06 +0200 Subject: [PATCH 02/16] Add `KaryotypicSex`. --- src/gpsea/model/__init__.py | 10 +++--- src/gpsea/model/_base.py | 63 +++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/src/gpsea/model/__init__.py b/src/gpsea/model/__init__.py index 2530b4987..d1b8e8925 100644 --- a/src/gpsea/model/__init__.py +++ b/src/gpsea/model/__init__.py @@ -5,19 +5,21 @@ and we follow with data classes for phenotype, genotype, transcript, and protein info. """ -from ._base import SampleLabels +from ._base import SampleLabels, KaryotypicSex from ._cohort import Cohort, Patient from ._gt import Genotype, Genotypes, Genotyped from ._phenotype import Phenotype, Disease from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata from ._tx import TranscriptCoordinates -from ._variant import VariantCoordinates, ImpreciseSvInfo, VariantInfo, VariantInfoAware, TranscriptAnnotation, TranscriptInfoAware, VariantClass, Variant +from ._variant import VariantCoordinates, ImpreciseSvInfo, VariantInfo, VariantInfoAware, VariantClass, Variant +from ._variant import TranscriptAnnotation, TranscriptInfoAware from ._variant_effects import VariantEffect __all__ = [ - 'Cohort', 'Patient', 'SampleLabels', + 'Cohort', 'Patient', 'SampleLabels', 'KaryotypicSex', 'Phenotype', 'Disease', - 'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware', 'Genotype', 'Genotypes', 'Genotyped', + 'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware', + 'Genotype', 'Genotypes', 'Genotyped', 'TranscriptAnnotation', 'VariantEffect', 'TranscriptInfoAware', 'TranscriptCoordinates', 'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType', ] diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py index 36716831e..31ca49c80 100644 --- a/src/gpsea/model/_base.py +++ b/src/gpsea/model/_base.py @@ -1,8 +1,71 @@ +import enum import typing import hpotk +class KaryotypicSex(enum.Enum): + """ + `KaryotypicSex`, as included in + `Phenopacket Schema `_ + """ + + UNKNOWN_KARYOTYPE = 0 + """ + Untyped or inconclusive karyotyping. + """ + + XX = 1 + """ + Female. + """ + + XY = 2 + """ + Male. + """ + + XO = 3 + """ + Single X chromosome. + """ + + XXY = 4 + """ + Two X and one Y chromosome. + """ + + XXX = 5 + """ + Three X chromosomes + """ + + XXYY = 6 + """ + Two X chromosomes and two Y chromosomes. + """ + + XXXY = 7 + """ + Three X chromosomes and one Y chromosome. + """ + + XXXX = 8 + """ + Four X chromosomes. + """ + + XYY = 9 + """ + One X and two Y chromosomes. + """ + + OTHER_KARYOTYPE = 10 + """ + None of the above types. + """ + + class SampleLabels: """ A data model for subject identifiers. From 9613f9416bc4fde27c5898e4e0593798540ac578 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 28 Aug 2024 21:18:30 +0200 Subject: [PATCH 03/16] Add `Sex` to `Patient`. --- src/gpsea/analysis/_config.py | 2 +- src/gpsea/data/_toy.py | 462 +++++++++++------- src/gpsea/io.py | 8 +- src/gpsea/model/__init__.py | 4 +- src/gpsea/model/_base.py | 57 +-- src/gpsea/model/_cohort.py | 48 +- src/gpsea/preprocessing/_audit.py | 2 +- src/gpsea/preprocessing/_phenopacket.py | 220 ++++++--- tests/analysis/conftest.py | 6 +- tests/analysis/predicate/genotype/conftest.py | 6 +- .../predicate/genotype/test_allele_counter.py | 3 +- tests/analysis/test_pscore.py | 5 +- tests/conftest.py | 83 ++-- tests/test_data/SUOX.json | 35 ++ tests/test_io.py | 1 - 15 files changed, 593 insertions(+), 349 deletions(-) diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py index dd723347c..2c6d68c63 100644 --- a/src/gpsea/analysis/_config.py +++ b/src/gpsea/analysis/_config.py @@ -271,7 +271,7 @@ def configure_cohort_analysis( os.makedirs(cache_path, exist_ok=True) cache_dir = str(cache_path) - protein_metadata_service = configure_default_protein_metadata_service(protein_source, cache_dir) + protein_metadata_service = backup_pms(protein_source, cache_dir) mtc_filter: PhenotypeMtcFilter if config.mtc_strategy == MtcStrategy.HPO_MTC: diff --git a/src/gpsea/data/_toy.py b/src/gpsea/data/_toy.py index e9b80234e..24a3b9c4b 100644 --- a/src/gpsea/data/_toy.py +++ b/src/gpsea/data/_toy.py @@ -3,7 +3,7 @@ from gpsea.model import * from gpsea.model.genome import Contig, GenomicRegion, Strand -CONTIG = Contig('1', 'GB_ACC', 'REFSEQ_NAME', 'UCSC_NAME', 1_000) +CONTIG = Contig("1", "GB_ACC", "REFSEQ_NAME", "UCSC_NAME", 1_000) def make_region(start: int, end: int) -> GenomicRegion: @@ -22,189 +22,293 @@ def get_toy_cohort() -> Cohort: # - Spasticity HP:0001257 # - Chronic pancreatitis HP:0006280 - arachnodactyly_T = Phenotype(TermId.from_curie('HP:0001166'), True) - focal_clonic_seizure_T = Phenotype(TermId.from_curie('HP:0002266'), True) - seizure_T = Phenotype(TermId.from_curie('HP:0001250'), True) - spasticity_T = Phenotype(TermId.from_curie('HP:0001257'), True) - arachnodactyly_F = Phenotype(TermId.from_curie('HP:0001166'), False) - focal_clonic_seizure_F = Phenotype(TermId.from_curie('HP:0002266'), False) - seizure_F = Phenotype(TermId.from_curie('HP:0001250'), False) - spasticity_F = Phenotype(TermId.from_curie('HP:0001257'), False) - - Disease_T = Disease(TermId.from_curie('OMIM:001234'), "Test present disease", True) - Disease_F = Disease(TermId.from_curie('OMIM:001234'), "Test absent disease", False) + arachnodactyly_T = Phenotype(TermId.from_curie("HP:0001166"), True) + focal_clonic_seizure_T = Phenotype(TermId.from_curie("HP:0002266"), True) + seizure_T = Phenotype(TermId.from_curie("HP:0001250"), True) + spasticity_T = Phenotype(TermId.from_curie("HP:0001257"), True) + arachnodactyly_F = Phenotype(TermId.from_curie("HP:0001166"), False) + focal_clonic_seizure_F = Phenotype(TermId.from_curie("HP:0002266"), False) + seizure_F = Phenotype(TermId.from_curie("HP:0001250"), False) + spasticity_F = Phenotype(TermId.from_curie("HP:0001257"), False) - snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene', - 'NM_1234.5', 'NM_1234.5:c.180A>G', False, [VariantEffect.MISSENSE_VARIANT], [1], - 'NP_09876.5', 'NP_09876.5:p.Unk200', 60, 60, - Genotypes.from_mapping({ - SampleLabels('A'): Genotype.HETEROZYGOUS, SampleLabels('B'): Genotype.HETEROZYGOUS, - SampleLabels('C'): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels('D'): Genotype.HETEROZYGOUS, SampleLabels('E'): Genotype.HETEROZYGOUS, - SampleLabels('G'): Genotype.HETEROZYGOUS, SampleLabels('J'): Genotype.HETEROZYGOUS, - SampleLabels('K'): Genotype.HETEROZYGOUS, SampleLabels('M'): Genotype.HETEROZYGOUS, - SampleLabels('N'): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels('P'): Genotype.HETEROZYGOUS, - SampleLabels('Q'): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels('R'): Genotype.HETEROZYGOUS, SampleLabels('T'): Genotype.HETEROZYGOUS, - SampleLabels('V'): Genotype.HETEROZYGOUS, SampleLabels('Y'): Genotype.HETEROZYGOUS, - }) - ) - deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2), - 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', - False, [VariantEffect.FRAMESHIFT_VARIANT], - [2], 'NP_09876.5','NP_09876.5:p.Unk200', 86, 87, - Genotypes.from_mapping({ - SampleLabels('D'): Genotype.HETEROZYGOUS, SampleLabels('F'): Genotype.HETEROZYGOUS, - SampleLabels('G'): Genotype.HETEROZYGOUS, SampleLabels('H'): Genotype.HETEROZYGOUS, - SampleLabels('I'): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels('L'): Genotype.HETEROZYGOUS, SampleLabels('O'): Genotype.HETEROZYGOUS, - SampleLabels('R'): Genotype.HETEROZYGOUS, - SampleLabels('S'): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels('U'): Genotype.HETEROZYGOUS, SampleLabels('W'): Genotype.HETEROZYGOUS, - SampleLabels('X'): Genotype.HETEROZYGOUS, SampleLabels('Z'): Genotype.HETEROZYGOUS, - }) - ) + Disease_T = Disease(TermId.from_curie("OMIM:001234"), "Test present disease", True) + Disease_F = Disease(TermId.from_curie("OMIM:001234"), "Test absent disease", False) + + snv = Variant.create_variant_from_scratch( + VariantCoordinates(make_region(280, 281), "A", "G", 0), + "FakeGene", + "NM_1234.5", + "NM_1234.5:c.180A>G", + False, + [VariantEffect.MISSENSE_VARIANT], + [1], + "NP_09876.5", + "NP_09876.5:p.Unk200", + 60, + 60, + Genotypes.from_mapping( + { + SampleLabels("A"): Genotype.HETEROZYGOUS, + SampleLabels("B"): Genotype.HETEROZYGOUS, + SampleLabels("C"): Genotype.HOMOZYGOUS_ALTERNATE, + SampleLabels("D"): Genotype.HETEROZYGOUS, + SampleLabels("E"): Genotype.HETEROZYGOUS, + SampleLabels("G"): Genotype.HETEROZYGOUS, + SampleLabels("J"): Genotype.HETEROZYGOUS, + SampleLabels("K"): Genotype.HETEROZYGOUS, + SampleLabels("M"): Genotype.HETEROZYGOUS, + SampleLabels("N"): Genotype.HOMOZYGOUS_ALTERNATE, + SampleLabels("P"): Genotype.HETEROZYGOUS, + SampleLabels("Q"): Genotype.HOMOZYGOUS_ALTERNATE, + SampleLabels("R"): Genotype.HETEROZYGOUS, + SampleLabels("T"): Genotype.HETEROZYGOUS, + SampleLabels("V"): Genotype.HETEROZYGOUS, + SampleLabels("Y"): Genotype.HETEROZYGOUS, + } + ), + ) + deletion = Variant.create_variant_from_scratch( + VariantCoordinates(make_region(360, 363), "TTC", "T", -2), + "FakeGene", + "NM_1234.5", + "NM_1234.5:c.261_263del", + False, + [VariantEffect.FRAMESHIFT_VARIANT], + [2], + "NP_09876.5", + "NP_09876.5:p.Unk200", + 86, + 87, + Genotypes.from_mapping( + { + SampleLabels("D"): Genotype.HETEROZYGOUS, + SampleLabels("F"): Genotype.HETEROZYGOUS, + SampleLabels("G"): Genotype.HETEROZYGOUS, + SampleLabels("H"): Genotype.HETEROZYGOUS, + SampleLabels("I"): Genotype.HOMOZYGOUS_ALTERNATE, + SampleLabels("L"): Genotype.HETEROZYGOUS, + SampleLabels("O"): Genotype.HETEROZYGOUS, + SampleLabels("R"): Genotype.HETEROZYGOUS, + SampleLabels("S"): Genotype.HOMOZYGOUS_ALTERNATE, + SampleLabels("U"): Genotype.HETEROZYGOUS, + SampleLabels("W"): Genotype.HETEROZYGOUS, + SampleLabels("X"): Genotype.HETEROZYGOUS, + SampleLabels("Z"): Genotype.HETEROZYGOUS, + } + ), + ) het_dup = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5', - 'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], 'NP_09876.5', 'NP_09876.5:p.Unk200', 25, 25, - Genotypes.empty()) # Not used in the patients below, hence `empty()`. + VariantCoordinates(make_region(175, 176), "T", "TG", 1), + "FakeGene", + "NM_1234.5", + "NM_1234.5:c.75A>G", + False, + [VariantEffect.FRAMESHIFT_VARIANT], + [1], + "NP_09876.5", + "NP_09876.5:p.Unk200", + 25, + 25, + Genotypes.empty(), + ) # Not used in the patients below, hence `empty()`. hom_dup = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5', - 'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], 'NP_09876.5', 'NP_09876.5:p.Unk200', 25, 25, - Genotypes.empty()) # Not used in the patients below, hence `empty()`. + VariantCoordinates(make_region(175, 176), "T", "TG", 1), + "FakeGene", + "NM_1234.5", + "NM_1234.5:c.75A>G", + False, + [VariantEffect.FRAMESHIFT_VARIANT], + [1], + "NP_09876.5", + "NP_09876.5:p.Unk200", + 25, + 25, + Genotypes.empty(), + ) # Not used in the patients below, hence `empty()`. patients = ( - Patient(SampleLabels('A'), - phenotypes=(arachnodactyly_T, spasticity_F, seizure_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('B'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('C'), - phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('D'), - phenotypes=(arachnodactyly_T, spasticity_T, seizure_T), - variants=[snv, deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('E'), - phenotypes=(arachnodactyly_T, spasticity_T, seizure_F), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('F'), - phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('G'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv, deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('H'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('I'), - phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('J'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('K'), - phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('L'), - phenotypes=(arachnodactyly_F, seizure_F, spasticity_F), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('M'), - phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('N'), - phenotypes=(arachnodactyly_F, seizure_T, spasticity_F), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('O'), - phenotypes=(arachnodactyly_F, seizure_F, spasticity_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('P'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('Q'), - phenotypes=(arachnodactyly_T, seizure_F, spasticity_F), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('R'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[snv, deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('S'), - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('T'), - phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('U'), - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('V'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('W'), - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('X'), - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T] - ), - Patient(SampleLabels('Y'), - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T] - ), - Patient(SampleLabels('Z'), - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T] - ), + Patient.from_raw_parts( + SampleLabels("A"), + sex=None, + phenotypes=(arachnodactyly_T, spasticity_F, seizure_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("B"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("C"), + sex=None, + phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("D"), + sex=None, + phenotypes=(arachnodactyly_T, spasticity_T, seizure_T), + variants=[snv, deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("E"), + sex=None, + phenotypes=(arachnodactyly_T, spasticity_T, seizure_F), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("F"), + sex=None, + phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("G"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), + variants=[snv, deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("H"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("I"), + sex=None, + phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("J"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("K"), + sex=None, + phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("L"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_F, spasticity_F), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("M"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("N"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_T, spasticity_F), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("O"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_F, spasticity_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("P"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("Q"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_F, spasticity_F), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("R"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), + variants=[snv, deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("S"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("T"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("U"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("V"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("W"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("X"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), + variants=[deletion], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("Y"), + sex=None, + phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), + variants=[snv], + diseases=[Disease_T], + ), + Patient.from_raw_parts( + SampleLabels("Z"), + sex=None, + phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), + variants=[deletion], + diseases=[Disease_T], + ), ) return Cohort.from_patients(patients) diff --git a/src/gpsea/io.py b/src/gpsea/io.py index a9b300784..c5b3aea18 100644 --- a/src/gpsea/io.py +++ b/src/gpsea/io.py @@ -23,7 +23,7 @@ def default(self, o): 'tx_annotations': o.tx_annotations, 'genotypes': o.genotypes, } - elif isinstance(o, VariantInfo): + elif isinstance(o, VariantInfo): return { 'variant_coordinates': o.variant_coordinates, 'sv_info': o.sv_info, @@ -87,7 +87,7 @@ def default(self, o): 'label': o.label, 'meta_label': o.meta_label, } - elif isinstance(o, (Genotype, VariantEffect, Strand, VariantClass)): + elif isinstance(o, (Sex, Genotype, VariantEffect, Strand, VariantClass)): # enums return o.name elif isinstance(o, Phenotype): @@ -104,6 +104,7 @@ def default(self, o): elif isinstance(o, Patient): return { 'labels': o.labels, + 'sex': o.sex, 'phenotypes': o.phenotypes, 'diseases': o.diseases, 'variants': o.variants, @@ -161,7 +162,7 @@ def default(self, o): _FEATURE_INFO = ('name', 'region') _PHENOTYPE_FIELDS = ('term_id', 'is_present') _DISEASE_FIELDS = ('term_id', 'name', 'is_observed') -_PATIENT_FIELDS = ('labels', 'phenotypes', 'diseases', 'variants') +_PATIENT_FIELDS = ('labels', 'sex', 'phenotypes', 'diseases', 'variants') _COHORT_FIELDS = ('members', 'excluded_patient_count') @@ -281,6 +282,7 @@ def object_hook(obj: typing.Dict[typing.Any, typing.Any]) -> typing.Any: elif GpseaJSONDecoder._has_all_fields(obj, _PATIENT_FIELDS): return Patient( labels=obj['labels'], + sex=Sex[obj['sex']], phenotypes=obj['phenotypes'], diseases=obj['diseases'], variants=obj['variants'], diff --git a/src/gpsea/model/__init__.py b/src/gpsea/model/__init__.py index d1b8e8925..1d17138c5 100644 --- a/src/gpsea/model/__init__.py +++ b/src/gpsea/model/__init__.py @@ -5,7 +5,7 @@ and we follow with data classes for phenotype, genotype, transcript, and protein info. """ -from ._base import SampleLabels, KaryotypicSex +from ._base import SampleLabels, Sex from ._cohort import Cohort, Patient from ._gt import Genotype, Genotypes, Genotyped from ._phenotype import Phenotype, Disease @@ -16,7 +16,7 @@ from ._variant_effects import VariantEffect __all__ = [ - 'Cohort', 'Patient', 'SampleLabels', 'KaryotypicSex', + 'Cohort', 'Patient', 'SampleLabels', 'Sex', 'Phenotype', 'Disease', 'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware', 'Genotype', 'Genotypes', 'Genotyped', diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py index 31ca49c80..0effeab06 100644 --- a/src/gpsea/model/_base.py +++ b/src/gpsea/model/_base.py @@ -4,65 +4,26 @@ import hpotk -class KaryotypicSex(enum.Enum): - """ - `KaryotypicSex`, as included in - `Phenopacket Schema `_ - """ - - UNKNOWN_KARYOTYPE = 0 - """ - Untyped or inconclusive karyotyping. - """ - - XX = 1 - """ - Female. - """ - - XY = 2 - """ - Male. +class Sex(enum.Enum): """ + `Sex` represents typical “phenotypic sex”, as would be determined by a midwife or physician at birth. - XO = 3 - """ - Single X chromosome. + The definition is aligned with `Phenopacket Schema `_ """ - XXY = 4 + UNKNOWN_SEX = 0 """ - Two X and one Y chromosome. + Not assessed or not available. Maps to `NCIT:C17998`. """ - XXX = 5 - """ - Three X chromosomes - """ - - XXYY = 6 - """ - Two X chromosomes and two Y chromosomes. - """ - - XXXY = 7 - """ - Three X chromosomes and one Y chromosome. - """ - - XXXX = 8 - """ - Four X chromosomes. - """ - - XYY = 9 + FEMALE = 1 """ - One X and two Y chromosomes. + Female sex. Maps to `NCIT:C46113`. """ - OTHER_KARYOTYPE = 10 + MALE = 2 """ - None of the above types. + Male sex. Maps to `NCIT:C46112`. """ diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py index 3209748e8..03fcf08e4 100644 --- a/src/gpsea/model/_cohort.py +++ b/src/gpsea/model/_cohort.py @@ -5,7 +5,7 @@ import hpotk -from ._base import SampleLabels +from ._base import SampleLabels, Sex from ._phenotype import Phenotype, Disease from ._variant import Variant @@ -13,16 +13,49 @@ class Patient: """ `Patient` represents a single investigated individual. + + .. note:: + + We strongly recommend using the :func:`from_raw_parts` static constructor + instead of `__init__`. """ + @staticmethod + def from_raw_parts( + labels: SampleLabels, + sex: typing.Optional[Sex], + phenotypes: typing.Iterable[Phenotype], + diseases: typing.Iterable[Disease], + variants: typing.Iterable[Variant] + ) -> "Patient": + """ + Create `Patient` from the primary data. + """ + if sex is None: + sex = Sex.UNKNOWN_SEX + + return Patient( + labels=labels, + sex=sex, + phenotypes=phenotypes, + diseases=diseases, + variants=variants, + ) + def __init__( self, labels: SampleLabels, + sex: Sex, phenotypes: typing.Iterable[Phenotype], diseases: typing.Iterable[Disease], variants: typing.Iterable[Variant] ): + assert isinstance(labels, SampleLabels) self._labels = labels + + assert isinstance(sex, Sex) + self._sex = sex + self._phenotypes = tuple(phenotypes) self._diseases = tuple(diseases) self._variants = tuple(variants) @@ -41,6 +74,13 @@ def labels(self) -> SampleLabels: """ return self._labels + @property + def sex(self) -> Sex: + """ + Get the "phenotype sex" of the sample. + """ + return self._sex + @property def phenotypes(self) -> typing.Sequence[Phenotype]: """ @@ -89,6 +129,7 @@ def excluded_diseases(self) -> typing.Iterator[Disease]: def __str__(self) -> str: return (f"Patient(" f"labels:{self._labels}, " + f"sex:{self._sex}, " f"variants:{self.variants}, " f"phenotypes:{[pheno.identifier for pheno in self.phenotypes]}, " f"diseases:{[dis.identifier for dis in self.diseases]}") @@ -99,12 +140,13 @@ def __repr__(self) -> str: def __eq__(self, other) -> bool: return (isinstance(other, Patient) and self._labels == other._labels + and self._sex == other._sex and self._variants == other._variants and self._phenotypes == other._phenotypes and self._diseases == other._diseases) def __hash__(self) -> int: - return hash((self._labels, self._variants, self._phenotypes, self._diseases)) + return hash((self._labels, self._sex, self._variants, self._phenotypes, self._diseases)) class Cohort(typing.Sized, typing.Iterable[Patient]): @@ -201,7 +243,7 @@ def list_present_phenotypes( Otherwise, lists only the `top` highest counts Returns: - typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE, + typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE, number of patients with that phenotype) """ counter = Counter() diff --git a/src/gpsea/preprocessing/_audit.py b/src/gpsea/preprocessing/_audit.py index f4ab6597d..3bdafe97e 100644 --- a/src/gpsea/preprocessing/_audit.py +++ b/src/gpsea/preprocessing/_audit.py @@ -165,7 +165,7 @@ def __init__(self, label: str): @abc.abstractmethod - def add_subsection(self, label: str): + def add_subsection(self, label: str) -> "Notepad": """ Add a labeled subsection. diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py index abe83fbcc..136825bd7 100644 --- a/src/gpsea/preprocessing/_phenopacket.py +++ b/src/gpsea/preprocessing/_phenopacket.py @@ -4,15 +4,28 @@ import hpotk from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket +import phenopackets.schema.v2.core.individual_pb2 as ppi from phenopackets.schema.v2.core.disease_pb2 import Disease as PPDisease from phenopackets.schema.v2.core.interpretation_pb2 import GenomicInterpretation from phenopackets.vrsatile.v1.vrsatile_pb2 import VcfRecord, VariationDescriptor from phenopackets.vrs.v1.vrs_pb2 import Variation -from gpsea.model import Patient, SampleLabels, Disease -from gpsea.model import VariantClass, VariantCoordinates, ImpreciseSvInfo, VariantInfo, Variant, Genotype, Genotypes +from gpsea.model import SampleLabels, Patient, Sex, Disease +from gpsea.model import ( + VariantClass, + VariantCoordinates, + ImpreciseSvInfo, + VariantInfo, + Variant, + Genotype, + Genotypes, +) from gpsea.model.genome import GenomeBuild, GenomicRegion, Strand -from ._api import VariantCoordinateFinder, FunctionalAnnotator, ImpreciseSvFunctionalAnnotator +from ._api import ( + VariantCoordinateFinder, + FunctionalAnnotator, + ImpreciseSvFunctionalAnnotator, +) from ._audit import Notepad from ._patient import PatientCreator from ._phenotype import PhenotypeCreator @@ -27,14 +40,14 @@ def find_genotype( self, item: GenomicInterpretation, ) -> typing.Optional[Genotype]: - if item.HasField('variant_interpretation'): + if item.HasField("variant_interpretation"): variant_interpretation = item.variant_interpretation - if variant_interpretation.HasField('variation_descriptor'): + if variant_interpretation.HasField("variation_descriptor"): variation_descriptor = variant_interpretation.variation_descriptor - if variation_descriptor.HasField('allelic_state'): + if variation_descriptor.HasField("allelic_state"): genotype = variation_descriptor.allelic_state.label return self._map_geno_genotype_label(genotype) - + return None @staticmethod @@ -85,7 +98,7 @@ def find_coordinates( Args: item (GenomicInterpretation): a genomic interpretation element from Phenopacket Schema - + Returns: typing.Optional[VariantCoordinates]: variant coordinates """ @@ -102,8 +115,9 @@ def find_coordinates( variation_descriptor.vcf_record.genome_assembly ): raise ValueError( - f"Variant id {variation_descriptor.id} for patient {item.subject_or_biosample_id} has a different Genome Assembly than what was given. " - + f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}." + f"Variant id {variation_descriptor.id} for patient {item.subject_or_biosample_id} " + "has a different Genome Assembly than what was given. " + f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}." ) contig = self._build.contig_by_name(variation_descriptor.vcf_record.chrom) start = int(variation_descriptor.vcf_record.pos) - 1 @@ -134,7 +148,9 @@ def find_coordinates( alt = "" change_length = end - start else: - raise ValueError(f"The copy number of {number} is not supported. Supported values: {{1, 3}}") + raise ValueError( + f"The copy number of {number} is not supported. Supported values: {{1, 3}}" + ) region = GenomicRegion(contig, start, end, Strand.POSITIVE) return VariantCoordinates(region, ref, alt, change_length) @@ -197,7 +213,7 @@ def _looks_like_large_sv( else None ) - # If we have these fields, we seem to have all information + # If we have these fields, we seem to have all information # to parse the variation descriptor elsewhere. return structural_type is not None and gene_context is not None @@ -214,7 +230,19 @@ def __init__( functional_annotator: FunctionalAnnotator, imprecise_sv_functional_annotator: ImpreciseSvFunctionalAnnotator, hgvs_coordinate_finder: VariantCoordinateFinder[str], + assume_karyotypic_sex: bool = True, ): + """ + Create an instance. + + :param build: the genome build to use for variants. + :param phenotype_creator: a phenotype creator for creating phenotypes. + :param functional_annotator: for computing functional annotations. + :param imprecise_sv_functional_annotator: for getting info about imprecise variants. + :param hgvs_coordinate_finder: for finding chromosomal coordinates for HGVS variant descriptions. + :param assume_karyotypic_sex: `True` if it is OK to assume that `FEMALE` has the `XX` karyotype + and `MALE` has `XY`. + """ self._logger = logging.getLogger(__name__) # Violates DI, but it is specific to this class, so I'll leave it "as is". self._coord_finder = PhenopacketVariantCoordinateFinder( @@ -228,49 +256,64 @@ def __init__( functional_annotator, FunctionalAnnotator, "functional_annotator" ) self._imprecise_sv_functional_annotator = hpotk.util.validate_instance( - imprecise_sv_functional_annotator, ImpreciseSvFunctionalAnnotator, "imprecise_sv_functional_annotator" + imprecise_sv_functional_annotator, + ImpreciseSvFunctionalAnnotator, + "imprecise_sv_functional_annotator", ) - + # Set of sequence ontology IDs that we will treat as a deletion (`DEL`) # for the purpose of assigning imprecise SV info with a variant class. self._so_deletions = { - '1000029', # chromosomal deletion: An incomplete chromosome. - '0001893', # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature. - '0001879', # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence, where the deletion, is greater than the extent of the underlying genomic features. + "1000029", # chromosomal deletion: An incomplete chromosome. + # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature. + "0001893", + # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence, + # where the deletion, is greater than the extent of the underlying genomic features. + "0001879", } self._so_duplications = { - '1000037', # chromosomal_duplication + "1000037", # chromosomal_duplication } + self._assume_karyotypic_sex = assume_karyotypic_sex - def process(self, inputs: Phenopacket, notepad: Notepad) -> Patient: + def process(self, pp: Phenopacket, notepad: Notepad) -> Patient: """Creates a Patient from the data in a given Phenopacket Args: - inputs (Phenopacket): A Phenopacket object + pp (Phenopacket): A Phenopacket object notepad (Notepad): notepad to write down the issues Returns: Patient: A Patient object """ sample_id = SampleLabels( - label=inputs.subject.id, - meta_label=inputs.id if len(inputs.id) > 0 else None, + label=pp.subject.id, + meta_label=pp.id if len(pp.id) > 0 else None, ) + # Extract karyotypic sex + indi = notepad.add_subsection("individual") + sex = self._extract_sex(pp.subject, indi) + # Check phenotypes pfs = notepad.add_subsection("phenotype-features") phenotypes = self._phenotype_creator.process( - ((pf.type.id, not pf.excluded) for pf in inputs.phenotypic_features), pfs + ((pf.type.id, not pf.excluded) for pf in pp.phenotypic_features), pfs ) # Check diseases - diseases = self._add_diseases([dis for dis in inputs.diseases], pfs) + dip = notepad.add_subsection("diseases") + diseases = self._add_diseases(pp.diseases, dip) # Check variants vs = notepad.add_subsection("variants") - variants = self._add_variants(sample_id, inputs, vs) - - return Patient( - sample_id, phenotypes=phenotypes, variants=variants, diseases=diseases + variants = self._add_variants(sample_id, pp, vs) + + return Patient.from_raw_parts( + sample_id, + sex=sex, + phenotypes=phenotypes, + variants=variants, + diseases=diseases, ) def _add_diseases( @@ -285,21 +328,43 @@ def _add_diseases( Sequence[Dis]: A list of Disease objects """ if len(diseases) == 0: - notepad.add_warning(f"No diseases found.") - return [] + notepad.add_warning("No diseases found") + return () + final_diseases = [] for i, dis in enumerate(diseases): if not dis.HasField("term"): - raise ValueError("Could not find term in Disease.") - term_id = hpotk.TermId.from_curie(dis.term.id) + notepad.add_error(f"#{i} has no `term`") + continue + else: + term_id = hpotk.TermId.from_curie(dis.term.id) + # Do not include excluded diseases if we decide to assume excluded if not included final_diseases.append(Disease(term_id, dis.term.label, not dis.excluded)) + return final_diseases + def _extract_sex( + self, + individual: ppi.Individual, + notepad: Notepad, + ) -> typing.Optional[Sex]: + # Let's use the phenotypic sex as fallback + sex = individual.sex + if sex == ppi.FEMALE: + return Sex.FEMALE + elif sex == ppi.MALE: + return Sex.MALE + elif sex == ppi.OTHER_SEX or sex == ppi.UNKNOWN_SEX: + return Sex.UNKNOWN_SEX + else: + notepad.add_warning(f'Unknown sex type: {sex}') + return Sex.UNKNOWN_SEX + def _add_variants( - self, - sample_id: SampleLabels, - pp: Phenopacket, + self, + sample_id: SampleLabels, + pp: Phenopacket, notepad: Notepad, ) -> typing.Sequence[Variant]: """Creates a list of Variant objects from the data in a given Phenopacket @@ -311,24 +376,28 @@ def _add_variants( Sequence[Variant]: A list of Variant objects """ variants = [] - + for i, interpretation in enumerate(pp.interpretations): - sub_note = notepad.add_subsection(f'#{i}') - if interpretation.HasField('diagnosis'): - for genomic_interpretation in interpretation.diagnosis.genomic_interpretations: + sub_note = notepad.add_subsection(f"#{i}") + if interpretation.HasField("diagnosis"): + for ( + genomic_interpretation + ) in interpretation.diagnosis.genomic_interpretations: gt = self._gt_parser.find_genotype(genomic_interpretation) if gt is None: sub_note.add_warning( - f"Could not extract genotype from genomic interpretation", + "Could not extract genotype from genomic interpretation", "Remove variant from testing", ) continue - - variant_info = self._extract_variant_info(sample_id, genomic_interpretation, sub_note) + + variant_info = self._extract_variant_info( + sample_id, genomic_interpretation, sub_note + ) if variant_info is None: # We already complained in the extract function continue - + if variant_info.has_variant_coordinates(): try: tx_annotations = self._functional_annotator.annotate( @@ -342,8 +411,10 @@ def _add_variants( continue elif variant_info.has_sv_info(): try: - tx_annotations = self._imprecise_sv_functional_annotator.annotate( - item=variant_info.sv_info, + tx_annotations = ( + self._imprecise_sv_functional_annotator.annotate( + item=variant_info.sv_info, + ) ) except ValueError as error: sub_note.add_warning( @@ -352,23 +423,25 @@ def _add_variants( ) continue else: - raise ValueError('VariantInfo should have either the coordinates or the SV info, but had neither!') + raise ValueError( + "VariantInfo should have either the coordinates or the SV info, but had neither!" + ) if len(tx_annotations) == 0: sub_note.add_warning( f"Patient {pp.id} has an error with variant {variant_info.variant_key}", - f"Remove variant from testing... tx_anno == 0", + "Remove variant from testing... tx_anno == 0", ) continue - genotype = Genotypes.single(sample_id, gt) variants.append( Variant( - variant_info=variant_info, - tx_annotations=tx_annotations, + variant_info=variant_info, + tx_annotations=tx_annotations, genotypes=genotype, - )) + ) + ) if len(variants) == 0: notepad.add_warning(f"Patient {pp.id} has no variants to work with") @@ -385,39 +458,50 @@ def _extract_variant_info( sv_info = None try: - variant_coordinates = self._coord_finder.find_coordinates(genomic_interpretation) + variant_coordinates = self._coord_finder.find_coordinates( + genomic_interpretation + ) except ValueError: notepad.add_warning( - f"Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` but had an error retrieving any from patient {sample_id}", + "Expected a VCF record, a VRS CNV, or an expression with `hgvs.c`" + f"but had an error retrieving any from patient {sample_id}", "Remove variant from testing", ) return None - + if variant_coordinates is None: sv_info = self._map_to_imprecise_sv(genomic_interpretation) if sv_info is None: notepad.add_warning( - f"Could not extract the information for large SV annotation", + "Could not extract the information for large SV annotation", "Remove variant from testing", ) return None - + return VariantInfo( variant_coordinates=variant_coordinates, sv_info=sv_info, ) def _map_to_imprecise_sv( - self, + self, genomic_interpretation: GenomicInterpretation, ) -> typing.Optional[ImpreciseSvInfo]: - if genomic_interpretation.HasField('variant_interpretation'): + if genomic_interpretation.HasField("variant_interpretation"): variant_interpretation = genomic_interpretation.variant_interpretation - if variant_interpretation.HasField('variation_descriptor'): + if variant_interpretation.HasField("variation_descriptor"): variation_descriptor = variant_interpretation.variation_descriptor - - structural_type = variation_descriptor.structural_type if variation_descriptor.HasField('structural_type') else None - gene_context = variation_descriptor.gene_context if variation_descriptor.HasField('gene_context') else None + + structural_type = ( + variation_descriptor.structural_type + if variation_descriptor.HasField("structural_type") + else None + ) + gene_context = ( + variation_descriptor.gene_context + if variation_descriptor.HasField("gene_context") + else None + ) if structural_type is not None and gene_context is not None: st = hpotk.TermId.from_curie(curie=structural_type.id) @@ -428,23 +512,23 @@ def _map_to_imprecise_sv( gene_id=gene_context.value_id, gene_symbol=gene_context.symbol, ) - + return None - + def _map_structural_type_to_variant_class( self, structural_type: hpotk.TermId, ) -> VariantClass: # This method is most likely incomplete. - # Please open a ticket if you receive a `ValueError` + # Please open a ticket if you receive a `ValueError` # for a structural type, that is not mapped at the moment, # to help us enhance the mapping. - if structural_type.prefix == 'SO': + if structural_type.prefix == "SO": if structural_type.id in self._so_deletions: return VariantClass.DEL elif structural_type.id in self._so_duplications: return VariantClass.DUP else: - raise ValueError(f'Unknown structural type {structural_type}') + raise ValueError(f"Unknown structural type {structural_type}") else: - raise ValueError(f'Unknown structural type {structural_type}') + raise ValueError(f"Unknown structural type {structural_type}") diff --git a/tests/analysis/conftest.py b/tests/analysis/conftest.py index 86dd01c43..90b46c8d8 100644 --- a/tests/analysis/conftest.py +++ b/tests/analysis/conftest.py @@ -35,8 +35,9 @@ def degenerated_cohort( return Cohort.from_patients( members=( - Patient( + Patient.from_raw_parts( labels=labels_a, + sex=Sex.UNKNOWN_SEX, phenotypes=( Phenotype( term_id=hpotk.TermId.from_curie("HP:0000118"), @@ -69,8 +70,9 @@ def degenerated_cohort( ), ), ), - Patient( + Patient.from_raw_parts( labels=labels_b, + sex=Sex.UNKNOWN_SEX, phenotypes=( Phenotype( term_id=hpotk.TermId.from_curie("HP:0000118"), diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 22b6d5882..3043d3419 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -187,8 +187,9 @@ def patient_w_missense( sample_labels: SampleLabels, missense_variant: Variant, ) -> Patient: - return Patient( + return Patient.from_raw_parts( labels=sample_labels, + sex=Sex.UNKNOWN_SEX, phenotypes=(), diseases=(), variants=( @@ -202,8 +203,9 @@ def patient_w_frameshift( sample_labels: SampleLabels, frameshift_variant: Variant, ) -> Patient: - return Patient( + return Patient.from_raw_parts( labels=sample_labels, + sex=Sex.UNKNOWN_SEX, phenotypes=(), diseases=(), variants=( diff --git a/tests/analysis/predicate/genotype/test_allele_counter.py b/tests/analysis/predicate/genotype/test_allele_counter.py index 0b4355012..506ec3023 100644 --- a/tests/analysis/predicate/genotype/test_allele_counter.py +++ b/tests/analysis/predicate/genotype/test_allele_counter.py @@ -175,8 +175,9 @@ def patient( hom_alt_lmna: Variant, hemi_dmd: Variant, ) -> Patient: - return Patient( + return Patient.from_raw_parts( labels=sample_labels, + sex=Sex.UNKNOWN_SEX, variants=( het_lmna, hom_alt_lmna, diff --git a/tests/analysis/test_pscore.py b/tests/analysis/test_pscore.py index 9b6329029..a4c1c24be 100644 --- a/tests/analysis/test_pscore.py +++ b/tests/analysis/test_pscore.py @@ -3,7 +3,7 @@ import hpotk import pytest -from gpsea.model import Patient, Phenotype, SampleLabels +from gpsea.model import Patient, Phenotype, SampleLabels, Sex from gpsea.analysis.pscore import CountingPhenotypeScorer @@ -50,8 +50,9 @@ def test_a_patient( expected: int, counting_scorer: CountingPhenotypeScorer, ): - patient = Patient( + patient = Patient.from_raw_parts( labels=SampleLabels("test"), + sex=Sex.UNKNOWN_SEX, phenotypes=( Phenotype( hpotk.TermId.from_curie(curie), diff --git a/tests/conftest.py b/tests/conftest.py index 54f012ecc..b66b55e7d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -268,42 +268,53 @@ def toy_cohort( genotypes=Genotypes.from_mapping({SampleLabels('LargeCNV'): Genotype.HETEROZYGOUS})) patients = ( - Patient(SampleLabels('HetSingleVar'), - phenotypes=( - test_phenotypes['arachnodactyly_T'], - test_phenotypes['spasticity_F'], - test_phenotypes['focal_clonic_seizure_T']), - variants=(dup,), - diseases=(test_diseases['KBG_T'],) - ), - Patient(SampleLabels('HetDoubleVar1'), - phenotypes=( - test_phenotypes['arachnodactyly_T'], test_phenotypes['seizure_T'], test_phenotypes['spasticity_T'], - ), - variants=(indel, snv_stop_gain), - diseases=(test_diseases['KBG_T'],) - ), - Patient(SampleLabels('HetDoubleVar2'), - phenotypes=( - test_phenotypes['arachnodactyly_F'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'], - ), - variants=(snv_missense, del_frameshift), - diseases=(test_diseases['KBG_T'],) - ), - Patient(SampleLabels('HomoVar'), - phenotypes=( - test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'], - ), - variants=(del_small,), - diseases=() - ), - Patient(SampleLabels('LargeCNV'), - phenotypes=( - test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_F'], - ), - variants=(del_large,), - diseases=() - ), + Patient.from_raw_parts( + SampleLabels('HetSingleVar'), + sex=Sex.UNKNOWN_SEX, + phenotypes=( + test_phenotypes['arachnodactyly_T'], + test_phenotypes['spasticity_F'], + test_phenotypes['focal_clonic_seizure_T'] + ), + variants=(dup,), + diseases=(test_diseases['KBG_T'],) + ), + Patient.from_raw_parts( + SampleLabels('HetDoubleVar1'), + sex=Sex.UNKNOWN_SEX, + phenotypes=( + test_phenotypes['arachnodactyly_T'], test_phenotypes['seizure_T'], test_phenotypes['spasticity_T'], + ), + variants=(indel, snv_stop_gain), + diseases=(test_diseases['KBG_T'],) + ), + Patient.from_raw_parts( + SampleLabels('HetDoubleVar2'), + sex=Sex.UNKNOWN_SEX, + phenotypes=( + test_phenotypes['arachnodactyly_F'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'], + ), + variants=(snv_missense, del_frameshift), + diseases=(test_diseases['KBG_T'],) + ), + Patient.from_raw_parts( + SampleLabels('HomoVar'), + sex=Sex.UNKNOWN_SEX, + phenotypes=( + test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'], + ), + variants=(del_small,), + diseases=() + ), + Patient.from_raw_parts( + SampleLabels('LargeCNV'), + sex=Sex.UNKNOWN_SEX, + phenotypes=( + test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_F'], + ), + variants=(del_large,), + diseases=() + ), ) return Cohort.from_patients(patients) diff --git a/tests/test_data/SUOX.json b/tests/test_data/SUOX.json index d85873adc..083f77dd6 100644 --- a/tests/test_data/SUOX.json +++ b/tests/test_data/SUOX.json @@ -5,6 +5,7 @@ "label": "individual_10_PMID_12112661", "meta_label": "PMID_36303223_individual_10_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -141,6 +142,7 @@ "label": "individual_11_PMID_12112661", "meta_label": "PMID_36303223_individual_11_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -392,6 +394,7 @@ "label": "individual_12_PMID_12112661", "meta_label": "PMID_36303223_individual_12_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -528,6 +531,7 @@ "label": "individual_13_PMID_12112661", "meta_label": "PMID_36303223_individual_13_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -664,6 +668,7 @@ "label": "individual_14_PMID_11825068", "meta_label": "PMID_36303223_individual_14_PMID_11825068" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0034332", @@ -840,6 +845,7 @@ "label": "individual_15_PMID_12001203", "meta_label": "PMID_36303223_individual_15_PMID_12001203" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -1000,6 +1006,7 @@ "label": "individual_16_PMID_12368985", "meta_label": "PMID_36303223_individual_16_PMID_12368985" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -1283,6 +1290,7 @@ "label": "individual_17_PMID_15952210", "meta_label": "PMID_36303223_individual_17_PMID_15952210" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -1459,6 +1467,7 @@ "label": "individual_18_PMID_23414711", "meta_label": "PMID_36303223_individual_18_PMID_23414711" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -1639,6 +1648,7 @@ "label": "individual_19_PMID_23452914", "meta_label": "PMID_36303223_individual_19_PMID_23452914" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0002071", @@ -1827,6 +1837,7 @@ "label": "individual_1_PMID_9050047", "meta_label": "PMID_36303223_individual_1_PMID_9050047" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -1999,6 +2010,7 @@ "label": "individual_20_PMID_24938149", "meta_label": "PMID_36303223_individual_20_PMID_24938149" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -2270,6 +2282,7 @@ "label": "individual_21_PMID_27289259", "meta_label": "PMID_36303223_individual_21_PMID_27289259" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -2442,6 +2455,7 @@ "label": "individual_22_PMID_27289259", "meta_label": "PMID_36303223_individual_22_PMID_27289259" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -2614,6 +2628,7 @@ "label": "individual_23_PMID_28725568", "meta_label": "PMID_36303223_individual_23_PMID_28725568" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -2897,6 +2912,7 @@ "label": "individual_24_PMID_28629418", "meta_label": "PMID_36303223_individual_24_PMID_28629418" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -3065,6 +3081,7 @@ "label": "individual_25_PMID_31870341", "meta_label": "PMID_36303223_individual_25_PMID_31870341" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0034332", @@ -3348,6 +3365,7 @@ "label": "individual_26_PMID_31870341", "meta_label": "PMID_36303223_individual_26_PMID_31870341" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0034332", @@ -3631,6 +3649,7 @@ "label": "individual_27_PMID_31870341", "meta_label": "PMID_36303223_individual_27_PMID_31870341" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0034332", @@ -3914,6 +3933,7 @@ "label": "individual_28_PMID_33335014", "meta_label": "PMID_36303223_individual_28_PMID_33335014" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -4086,6 +4106,7 @@ "label": "individual_29_PMID_33335014", "meta_label": "PMID_36303223_individual_29_PMID_33335014" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -4262,6 +4283,7 @@ "label": "individual_2_PMID_9600976", "meta_label": "PMID_36303223_individual_2_PMID_9600976" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0012758", @@ -4434,6 +4456,7 @@ "label": "individual_30_PMID_31806255", "meta_label": "PMID_36303223_individual_30_PMID_31806255" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0034332", @@ -4622,6 +4645,7 @@ "label": "individual_31_PMID_31806255", "meta_label": "PMID_36303223_individual_31_PMID_31806255" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0034332", @@ -4810,6 +4834,7 @@ "label": "individual_32_PMID_34025712", "meta_label": "PMID_36303223_individual_32_PMID_34025712" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -5097,6 +5122,7 @@ "label": "individual_33_PMID_33405344", "meta_label": "PMID_36303223_individual_33_PMID_33405344" }, + "sex": "FEMALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -5368,6 +5394,7 @@ "label": "individual_34_PMID_33405344", "meta_label": "PMID_36303223_individual_34_PMID_33405344" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -5639,6 +5666,7 @@ "label": "individual_35_PMID_36303223", "meta_label": "PMID_36303223_individual_35_PMID_36303223" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0001083", @@ -5938,6 +5966,7 @@ "label": "individual_3_PMID_10519592", "meta_label": "PMID_36303223_individual_3_PMID_10519592" }, + "sex": "MALE", "phenotypes": [ { "term_id": "HP:0001250", @@ -6229,6 +6258,7 @@ "label": "individual_4_PMID_12112661", "meta_label": "PMID_36303223_individual_4_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -6365,6 +6395,7 @@ "label": "individual_5_PMID_12112661", "meta_label": "PMID_36303223_individual_5_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -6616,6 +6647,7 @@ "label": "individual_6_PMID_12112661", "meta_label": "PMID_36303223_individual_6_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -6752,6 +6784,7 @@ "label": "individual_7_PMID_12112661", "meta_label": "PMID_36303223_individual_7_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -6888,6 +6921,7 @@ "label": "individual_8_PMID_12112661", "meta_label": "PMID_36303223_individual_8_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", @@ -7024,6 +7058,7 @@ "label": "individual_9_PMID_12112661", "meta_label": "PMID_36303223_individual_9_PMID_12112661" }, + "sex": "UNKNOWN_SEX", "phenotypes": [ { "term_id": "HP:0001250", diff --git a/tests/test_io.py b/tests/test_io.py index a479dceed..7d45cabdd 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -24,7 +24,6 @@ def test_regenerate_cohort( ): """ The test for regenerating the `SUOX.json` file based on a cohort of phenopackets. - The test needs path to a folder with phenopacket JSON files (empty `str` below). Note, the test may need to be run multiple times if the ENSEMBL API times out. """ From 202be91a8acf4141f79f5e413b32054a125dfd20 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 29 Aug 2024 15:10:08 +0200 Subject: [PATCH 04/16] Implement `ModeOfInheritanceParedicate`. --- src/gpsea/analysis/predicate/_api.py | 31 +- .../analysis/predicate/genotype/__init__.py | 2 + .../predicate/genotype/_gt_predicates.py | 424 ++++++++++++++++-- src/gpsea/model/_base.py | 24 + tests/analysis/predicate/genotype/conftest.py | 400 ++++++++++++++++- .../predicate/genotype/test_gt_predicates.py | 104 ++++- 6 files changed, 941 insertions(+), 44 deletions(-) diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py index 8c62e1910..da446a546 100644 --- a/src/gpsea/analysis/predicate/_api.py +++ b/src/gpsea/analysis/predicate/_api.py @@ -1,6 +1,8 @@ import abc import typing +from collections import Counter + import hpotk.util from gpsea.model import Patient @@ -136,9 +138,7 @@ class PolyPredicate(typing.Generic[C], metaclass=abc.ABCMeta): and *exhaustive* - the groups must cover all possible scenarios. However, if the patient cannot be assigned into any meaningful category, `None` can be returned. - - Note, that `PolyPredicate` must be used on a happy path - testing a patient must inherently make sense. - Predicate will *not* check if, for instance, the patient variants are compatible with a certain mode of inheritance. + As a rule of thumb, returning `None` will exclude the patient from the analysis. """ @abc.abstractmethod @@ -210,3 +210,28 @@ def _check_patient(patient: Patient): """ if not isinstance(patient, Patient): raise ValueError(f"patient must be type Patient but was type {type(patient)}") + + @staticmethod + def _check_categorizations( + categorizations: typing.Sequence[Categorization], + ) -> typing.Sequence[str]: + """ + Check that the categorizations meet the requirements. + + The requirements include: + + * the `cat_id` must be unique within the predicate + """ + issues = [] + # There must be at least one category + + # `cat_id` must be unique for a predicate! + cat_id_counts = Counter() + for c in categorizations: + cat_id_counts[c.category.cat_id] += 1 + + for cat_id, count in cat_id_counts.items(): + if count > 1: + issues.append(f'`cat_id` {cat_id} is present {count}>1 times') + + return issues diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index a8a2ba919..ff8d16e1b 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -2,11 +2,13 @@ from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate +from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', 'boolean_predicate', 'groups_predicate', 'recessive_predicate', + 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', ] diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 3f54a7e02..601aa70bc 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,8 +1,12 @@ +import dataclasses +import enum import typing -from gpsea.model import Patient +from collections import defaultdict -from .._api import Categorization, PatientCategories +from gpsea.model import Patient, Sex + +from .._api import Categorization, PatientCategory, PatientCategories from ._api import GenotypePolyPredicate, RecessiveGroupingPredicate from ._api import VariantPredicate from ._counter import AlleleCounter @@ -30,7 +34,10 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: :attr:`AlleleCountingGenotypeBooleanPredicate.NO` or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category. """ - return AlleleCountingGenotypeBooleanPredicate.YES, AlleleCountingGenotypeBooleanPredicate.NO + return ( + AlleleCountingGenotypeBooleanPredicate.YES, + AlleleCountingGenotypeBooleanPredicate.NO, + ) def get_question(self) -> str: return self._allele_counter.get_question() @@ -49,15 +56,17 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: ) def __eq__(self, value: object) -> bool: - return isinstance(value, AlleleCountingGenotypeBooleanPredicate) \ + return ( + isinstance(value, AlleleCountingGenotypeBooleanPredicate) and self._allele_counter == value._allele_counter - + ) + def __hash__(self) -> int: return hash((self._allele_counter,)) - + def __str__(self) -> str: - return f'AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})' - + return f"AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})" + def __repr__(self) -> str: return str(self) @@ -83,8 +92,8 @@ def __init__( ): self._counters = tuple(counters) self._categorizations = tuple(categorizations) - group_names = ', '.join(c.category.name for c in self._categorizations) - self._question = f'Genotype group: {group_names}' + group_names = ", ".join(c.category.name for c in self._categorizations) + self._question = f"Genotype group: {group_names}" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -108,20 +117,29 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return None def __eq__(self, value: object) -> bool: - return isinstance(value, AlleleCountingGroupsPredicate) \ - and self._counters == value._counters \ + return ( + isinstance(value, AlleleCountingGroupsPredicate) + and self._counters == value._counters and self._categorizations == value._categorizations + ) def __hash__(self) -> int: - return hash((self._counters, self._categorizations,)) + return hash( + ( + self._counters, + self._categorizations, + ) + ) def __str__(self) -> str: return self.get_question() def __repr__(self) -> str: - return 'AlleleCountingGroupsPredicate(' \ - + 'counters={self._counters}, ' \ - + 'categorizations={self._categorizations})' + return ( + "AlleleCountingGroupsPredicate(" + + "counters={self._counters}, " + + "categorizations={self._categorizations})" + ) def groups_predicate( @@ -141,9 +159,10 @@ def groups_predicate( predicates = tuple(predicates) group_names = tuple(group_names) - assert len(predicates) >= 2, f'We need at least 2 predicates: {len(predicates)}' - assert len(predicates) == len(group_names), \ - f'The number of group names must match the number of predicates: {len(group_names)}!={len(predicates)}' + assert len(predicates) >= 2, f"We need at least 2 predicates: {len(predicates)}" + assert len(predicates) == len( + group_names + ), f"The number of group names must match the number of predicates: {len(group_names)}!={len(predicates)}" # Then, prepare the counters and categorizations. counters = [AlleleCounter(predicate=predicate) for predicate in predicates] @@ -170,7 +189,7 @@ class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate): # Therefore, I do not write any tests at this point. def __init__( - self, + self, allele_counter: AlleleCounter, ): self._allele_counter = allele_counter @@ -190,16 +209,21 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return RecessiveGroupingPredicate.BOTH else: return None - + def __eq__(self, value: object) -> bool: - return isinstance(value, AlleleCountingRecessivePredicate) and self._allele_counter == value._allele_counter - + return ( + isinstance(value, AlleleCountingRecessivePredicate) + and self._allele_counter == value._allele_counter + ) + def __hash__(self) -> int: return hash((self._allele_counter,)) - + def __str__(self) -> str: - return f'AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})' - + return ( + f"AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})" + ) + def __repr__(self) -> str: return str(self) @@ -209,8 +233,8 @@ def recessive_predicate( ) -> GenotypePolyPredicate: """ Create a recessive grouping predicate from given `variant_predicate` - to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`, - :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`, + to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`, + :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`, depending on the number of variant alleles matching the variant predicate. The patient is assigned into a group in the following manner: @@ -221,3 +245,347 @@ def recessive_predicate( """ allele_counter = AlleleCounter(predicate=variant_predicate) return AlleleCountingRecessivePredicate(allele_counter=allele_counter) + + +@dataclasses.dataclass(eq=True, frozen=True) +class GenotypeGroup: + allele_count: int + sex: typing.Optional[Sex] + categorization: Categorization + + +class MendelianInheritanceAspect(enum.Enum): + AUTOSOMAL = 0 + """ + Related to chromosomes that do *not* determine the sex of an individual. + """ + + GONOSOMAL = 1 + """ + Related to chromosomes that determine the sex of an individual. + """ + + MITOCHONDRIAL = 2 + """ + Related to mitochondrial DNA. + """ + + +class ModeOfInheritanceInfo: + + @staticmethod + def autosomal_dominant() -> "ModeOfInheritanceInfo": + groups = ( + GenotypeGroup( + allele_count=0, + sex=None, + categorization=Categorization( + PatientCategory( + cat_id=0, name="0/0", description="Homozygous reference" + ), + ), + ), + GenotypeGroup( + allele_count=1, + sex=None, + categorization=Categorization( + PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), + ), + ), + ) + return ModeOfInheritanceInfo( + mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, + groups=groups, + ) + + @staticmethod + def autosomal_recessive() -> "ModeOfInheritanceInfo": + groups = ( + GenotypeGroup( + allele_count=0, + sex=None, + categorization=Categorization( + PatientCategory( + cat_id=0, name="0/0", description="Homozygous reference" + ), + ), + ), + GenotypeGroup( + allele_count=1, + sex=None, + categorization=Categorization( + PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), + ), + ), + GenotypeGroup( + allele_count=2, + sex=None, + categorization=Categorization( + PatientCategory(cat_id=2, name="1/1", description="Homozygous alternate"), + ), + ), + ) + return ModeOfInheritanceInfo( + mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, + groups=groups, + ) + + @staticmethod + def x_dominant() -> "ModeOfInheritanceInfo": + groups = ( + GenotypeGroup( + allele_count=0, + sex=None, + categorization=Categorization( + PatientCategory( + cat_id=0, name="0", description="Homozygous reference" + ), + ), + ), + GenotypeGroup( + allele_count=1, + sex=Sex.FEMALE, + categorization=Categorization( + PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), + ), + ), + GenotypeGroup( + allele_count=1, + sex=Sex.MALE, + categorization=Categorization( + PatientCategory(cat_id=2, name="1", description="Hemizygous"), + ), + ), + ) + return ModeOfInheritanceInfo( + mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, + groups=groups, + ) + + @staticmethod + def x_recessive() -> "ModeOfInheritanceInfo": + groups = ( + GenotypeGroup( + allele_count=0, + sex=None, + categorization=Categorization( + PatientCategory( + cat_id=0, name="0/0", description="Homozygous reference" + ), + ), + ), + GenotypeGroup( + allele_count=1, + sex=Sex.FEMALE, + categorization=Categorization( + PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), + ), + ), + GenotypeGroup( + allele_count=1, + sex=Sex.MALE, + categorization=Categorization( + PatientCategory(cat_id=2, name="1", description="Hemizygous"), + ), + ), + ) + + return ModeOfInheritanceInfo( + mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, + groups=groups, + ) + + def __init__( + self, + mendelian_inheritance_aspect: MendelianInheritanceAspect, + groups: typing.Iterable[GenotypeGroup], + ): + # We pre-compute the hash manually. + # The correctness depends on two default dicts with same keys and values + # comparing equal. + hash_value = 17 + assert isinstance(mendelian_inheritance_aspect, MendelianInheritanceAspect) + self._aspect = mendelian_inheritance_aspect + + hash_value += 31 * hash(self._aspect) + + self._groups = defaultdict(list) + for group in groups: + assert isinstance(group, GenotypeGroup) + self._groups[group.allele_count].append(group) + hash_value += 13 * hash(group) + + self._hash = hash_value + + @property + def groups(self) -> typing.Iterator[GenotypeGroup]: + # Flatten `values()` which is an iterable of lists. + return (group for meta_group in self._groups.values() for group in meta_group) + + @property + def mendelian_inheritance_aspect(self) -> MendelianInheritanceAspect: + return self._aspect + + def get_groups_for_allele_count( + self, + allele_count: int, + ) -> typing.Sequence[GenotypeGroup]: + try: + return self._groups[allele_count] + except KeyError: + # No group for this allele count is OK + return () + + def is_autosomal(self) -> bool: + return self._aspect == MendelianInheritanceAspect.AUTOSOMAL + + def is_gonosomal(self) -> bool: + return self._aspect == MendelianInheritanceAspect.GONOSOMAL + + def is_mitochondrial(self) -> bool: + return self._aspect == MendelianInheritanceAspect.MITOCHONDRIAL + + def __eq__(self, value: object) -> bool: + return ( + isinstance(value, ModeOfInheritanceInfo) + and self._aspect == value._aspect + and self._groups == value._groups + ) + + def __hash__(self) -> int: + return self._hash + + def __str__(self) -> str: + return f"ModeOfInheritanceInfo(aspect={self._aspect}, groups={self._groups})" + + def __repr__(self) -> str: + return str(self) + + +class ModeOfInheritancePredicate(GenotypePolyPredicate): + + @staticmethod + def autosomal_dominant( + variant_predicate: VariantPredicate, + ) -> "ModeOfInheritancePredicate": + return ModeOfInheritancePredicate.from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_dominant(), + ) + + @staticmethod + def autosomal_recessive( + variant_predicate: VariantPredicate, + ) -> "ModeOfInheritancePredicate": + return ModeOfInheritancePredicate.from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_recessive(), + ) + + @staticmethod + def x_dominant( + variant_predicate: VariantPredicate, + ) -> "ModeOfInheritancePredicate": + return ModeOfInheritancePredicate.from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_info=ModeOfInheritanceInfo.x_dominant(), + ) + + @staticmethod + def x_recessive( + variant_predicate: VariantPredicate, + ) -> "ModeOfInheritancePredicate": + return ModeOfInheritancePredicate.from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_info=ModeOfInheritanceInfo.x_recessive(), + ) + + @staticmethod + def from_moi_info( + variant_predicate: VariantPredicate, + mode_of_inheritance_info: ModeOfInheritanceInfo, + ) -> "ModeOfInheritancePredicate": + allele_counter = AlleleCounter(predicate=variant_predicate) + return ModeOfInheritancePredicate( + allele_counter=allele_counter, + mode_of_inheritance_info=mode_of_inheritance_info, + ) + + def __init__( + self, + allele_counter: AlleleCounter, + mode_of_inheritance_info: ModeOfInheritanceInfo, + ): + assert isinstance(allele_counter, AlleleCounter) + self._allele_counter = allele_counter + + assert isinstance(mode_of_inheritance_info, ModeOfInheritanceInfo) + self._moi_info = mode_of_inheritance_info + + self._categorizations = tuple(group.categorization for group in mode_of_inheritance_info.groups) + issues = ModeOfInheritancePredicate._check_categorizations(self._categorizations) + if issues: + raise ValueError('Cannot create predicate: {}'.format(', '.join(issues))) + self._question = 'Which genotype group does the patient fit in: {}'.format( + ', '.join(cat.category.name for cat in self._categorizations), + ) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return self._categorizations + + def get_question(self) -> str: + return self._question + + def test( + self, + patient: Patient, + ) -> typing.Optional[Categorization]: + self._check_patient(patient) + + if self._moi_info.is_autosomal(): + allele_count = self._allele_counter.count(patient) + groups = self._moi_info.get_groups_for_allele_count(allele_count) + if len(groups) == 1: + return groups[0].categorization + else: + return None + elif self._moi_info.is_gonosomal(): + if patient.sex.is_provided(): + allele_count = self._allele_counter.count(patient) + groups = self._moi_info.get_groups_for_allele_count(allele_count) + for group in groups: + if group.sex is not None and group.sex == patient.sex: + return group.categorization + return None + else: + # We must have patient's sex + # to do any meaningful analysis + # in the non-autosomal scenario. + return None + + elif self._moi_info.is_mitochondrial(): + # Cannot deal with mitochondrial inheritance right now. + return None + else: + # Bug, please report to the developers + raise ValueError("Unexpected mode of inheritance condition") + + def __eq__(self, value: object) -> bool: + return ( + isinstance(value, ModeOfInheritancePredicate) + and self._allele_counter == value._allele_counter + and self._moi_info == value._moi_info + ) + + def __hash__(self) -> int: + return hash((self._allele_counter, self._moi_info,)) + + def __str__(self) -> str: + return ( + "ModeOfInheritancePredicate(" + f"allele_counter={self._allele_counter}, " + f"moi_info={self._moi_info})" + ) + + def __repr__(self) -> str: + return str(self) diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py index 0effeab06..0d541a65d 100644 --- a/src/gpsea/model/_base.py +++ b/src/gpsea/model/_base.py @@ -26,6 +26,30 @@ class Sex(enum.Enum): Male sex. Maps to `NCIT:C46112`. """ + def is_provided(self) -> bool: + """ + Return `True` if the sex is a known value, such as `FEMALE` or `MALE`. + """ + return self != Sex.UNKNOWN_SEX + + def is_unknown(self) -> bool: + """ + Return `True` if this is an `UNKNOWN_SEX`. + """ + return self == Sex.UNKNOWN_SEX + + def is_female(self) -> bool: + """ + Return `True` if the sex represents a `FEMALE`. + """ + return self == Sex.MALE + + def is_male(self) -> bool: + """ + Return `True` if the sex represents a `MALE`. + """ + return self == Sex.MALE + class SampleLabels: """ diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 3043d3419..c44a214b8 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -192,9 +192,7 @@ def patient_w_missense( sex=Sex.UNKNOWN_SEX, phenotypes=(), diseases=(), - variants=( - missense_variant, - ), + variants=(missense_variant,), ) @@ -208,7 +206,399 @@ def patient_w_frameshift( sex=Sex.UNKNOWN_SEX, phenotypes=(), diseases=(), - variants=( - frameshift_variant, + variants=(frameshift_variant,), + ) + + +""" +Genesis family - Autosomal dominant + +* Adam - father, affected +* Eve - mother, unaffected +* Cain - son, affected +""" + + +@pytest.fixture(scope="package") +def genesis_mutation( + genome_build: GenomeBuild, + adam_label: SampleLabels, + eve_label: SampleLabels, + cain_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=100, + end=101, + strand=Strand.POSITIVE, + ), + ref="C", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=False, + variant_effects=( + VariantEffect.MISSENSE_VARIANT, + VariantEffect.SPLICE_DONOR_VARIANT, + ), + affected_exons=(4,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(40, 41), + ), + ), + genotypes=Genotypes.from_mapping( + { + adam_label: Genotype.HETEROZYGOUS, + eve_label: Genotype.HOMOZYGOUS_REFERENCE, + cain_label: Genotype.HETEROZYGOUS, + } + ), + ) + + +@pytest.fixture(scope="package") +def adam_label() -> SampleLabels: + return SampleLabels("Adam") + + +@pytest.fixture(scope="package") +def adam( + adam_label: SampleLabels, + genesis_mutation: Variant, +) -> Patient: + return Patient( + adam_label, + sex=Sex.MALE, + phenotypes=(), + diseases=(), + variants=(genesis_mutation,), + ) + + +@pytest.fixture(scope="package") +def eve_label() -> SampleLabels: + return SampleLabels("Eve") + + +@pytest.fixture(scope="package") +def eve( + eve_label: SampleLabels, + genesis_mutation: Variant, +) -> Patient: + return Patient( + eve_label, + sex=Sex.FEMALE, + phenotypes=(), + diseases=(), + variants=(genesis_mutation,), + ) + + +@pytest.fixture(scope="package") +def cain_label() -> SampleLabels: + return SampleLabels("Cain") + + +@pytest.fixture(scope="package") +def cain( + cain_label: SampleLabels, + genesis_mutation: Variant, +) -> Patient: + return Patient( + cain_label, + sex=Sex.MALE, + phenotypes=(), + diseases=(), + variants=(genesis_mutation,), + ) + + +""" +White family - Autosomal recessive + +* Walt - father, HET +* Skyler - mother, HET +* Flynn - son, HOM_ALT +* Holly - daughter, HOM_REF +""" + + +@pytest.fixture(scope="package") +def white_mutation( + genome_build: GenomeBuild, + walt_label: SampleLabels, + skyler_label: SampleLabels, + flynn_label: SampleLabels, + holly_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=100, + end=101, + strand=Strand.POSITIVE, + ), + ref="C", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=False, + variant_effects=( + VariantEffect.MISSENSE_VARIANT, + VariantEffect.SPLICE_DONOR_VARIANT, + ), + affected_exons=(4,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(40, 41), + ), + ), + genotypes=Genotypes.from_mapping( + { + walt_label: Genotype.HETEROZYGOUS, + skyler_label: Genotype.HETEROZYGOUS, + flynn_label: Genotype.HOMOZYGOUS_ALTERNATE, + holly_label: Genotype.HOMOZYGOUS_REFERENCE, + } ), ) + + +@pytest.fixture(scope="package") +def walt_label() -> SampleLabels: + return SampleLabels("Walt") + + +@pytest.fixture(scope="package") +def walt( + walt_label: SampleLabels, + white_mutation: Variant, +) -> Patient: + return Patient( + walt_label, + sex=Sex.MALE, + phenotypes=(), + diseases=(), + variants=(white_mutation,), + ) + + +@pytest.fixture(scope="package") +def skyler_label() -> SampleLabels: + return SampleLabels("Skyler") + + +@pytest.fixture(scope="package") +def skyler( + skyler_label: SampleLabels, + white_mutation: Variant, +) -> Patient: + return Patient( + skyler_label, + sex=Sex.FEMALE, + phenotypes=(), + diseases=(), + variants=(white_mutation,), + ) + + +@pytest.fixture(scope="package") +def flynn_label() -> SampleLabels: + return SampleLabels("Flynn") + + +@pytest.fixture(scope="package") +def flynn( + flynn_label: SampleLabels, + white_mutation: Variant, +) -> Patient: + return Patient( + flynn_label, + sex=Sex.MALE, + phenotypes=(), + diseases=(), + variants=(white_mutation,), + ) + + +@pytest.fixture(scope="package") +def holly_label() -> SampleLabels: + return SampleLabels("Holly") + + +@pytest.fixture(scope="package") +def holly( + holly_label: SampleLabels, + white_mutation: Variant, +) -> Patient: + return Patient( + holly_label, + sex=Sex.FEMALE, + phenotypes=(), + diseases=(), + variants=(white_mutation,), + ) + + +""" +Skywalker family - X-linked recessive + +* Anakin - father, homozygous reference (possibly hemizygous reference?) +* Padme - mother, heterozygous +* Luke - son, hemizygous +* Leia - daughter, heterozygous +""" + + +@pytest.fixture(scope="package") +def skywalker_mutation( + genome_build: GenomeBuild, + anakin_label: SampleLabels, + padme_label: SampleLabels, + luke_label: SampleLabels, + leia_label: SampleLabels, +) -> Variant: + chrX = genome_build.contig_by_name("chrX") + assert chrX is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chrX, + start=100, + end=101, + strand=Strand.POSITIVE, + ), + ref="C", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=False, + variant_effects=( + VariantEffect.MISSENSE_VARIANT, + VariantEffect.SPLICE_DONOR_VARIANT, + ), + affected_exons=(4,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(40, 41), + ), + ), + genotypes=Genotypes.from_mapping( + { + anakin_label: Genotype.HOMOZYGOUS_REFERENCE, + padme_label: Genotype.HETEROZYGOUS, + luke_label: Genotype.HEMIZYGOUS, + leia_label: Genotype.HETEROZYGOUS, + } + ), + ) + + +@pytest.fixture(scope="package") +def anakin_label() -> SampleLabels: + return SampleLabels("Anakin") + + +@pytest.fixture(scope="package") +def anakin( + anakin_label: SampleLabels, + skywalker_mutation: Variant, +) -> Patient: + return Patient( + anakin_label, + sex=Sex.MALE, + phenotypes=(), + diseases=(), + variants=(skywalker_mutation,), + ) + + +@pytest.fixture(scope="package") +def padme_label() -> SampleLabels: + return SampleLabels("Padme") + + +@pytest.fixture(scope="package") +def padme( + padme_label: SampleLabels, + skywalker_mutation: Variant, +) -> Patient: + return Patient( + padme_label, + sex=Sex.FEMALE, + phenotypes=(), + diseases=(), + variants=(skywalker_mutation,), + ) + + +@pytest.fixture(scope="package") +def luke_label() -> SampleLabels: + return SampleLabels("Luke") + + +@pytest.fixture(scope="package") +def luke( + luke_label: SampleLabels, + skywalker_mutation: Variant, +) -> Patient: + return Patient( + luke_label, + sex=Sex.MALE, + phenotypes=(), + diseases=(), + variants=(skywalker_mutation,), + ) + + +@pytest.fixture(scope="package") +def leia_label() -> SampleLabels: + return SampleLabels("Leia") + + +@pytest.fixture(scope="package") +def leia( + leia_label: SampleLabels, + skywalker_mutation: Variant, +) -> Patient: + return Patient( + leia_label, + sex=Sex.FEMALE, + phenotypes=(), + diseases=(), + variants=(skywalker_mutation,), + ) + + +""" +XR family +""" diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 3e71283b4..ac6977c8e 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -5,22 +5,23 @@ GenotypePolyPredicate, groups_predicate, VariantPredicates, + VariantPredicate, + ModeOfInheritancePredicate, ) -class TestGroupsPredicate: +TX_ID = "tx:xyz" + - TX_ID = "tx:xyz" +class TestGroupsPredicate: @pytest.fixture(scope="class") def predicate(self) -> GenotypePolyPredicate: return groups_predicate( predicates=( + VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID), VariantPredicates.variant_effect( - VariantEffect.MISSENSE_VARIANT, TestGroupsPredicate.TX_ID - ), - VariantPredicates.variant_effect( - VariantEffect.FRAMESHIFT_VARIANT, TestGroupsPredicate.TX_ID + VariantEffect.FRAMESHIFT_VARIANT, TX_ID ), ), group_names=( @@ -53,7 +54,7 @@ def test_get_categorizations( def test_test__missense( self, - patient_w_missense: Variant, + patient_w_missense: Patient, predicate: GenotypePolyPredicate, ): cat = predicate.test(patient_w_missense) @@ -65,7 +66,7 @@ def test_test__missense( def test_test__frameshift( self, - patient_w_frameshift: Variant, + patient_w_frameshift: Patient, predicate: GenotypePolyPredicate, ): cat = predicate.test(patient_w_frameshift) @@ -74,3 +75,90 @@ def test_test__frameshift( assert cat.category.cat_id == 1 assert cat.category.name == "LoF" assert cat.category.description == "FRAMESHIFT_VARIANT on tx:xyz" + + +class TestModeOfInheritancePredicate: + + @pytest.fixture(scope="class") + def variant_predicate(self) -> VariantPredicate: + return VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + + @pytest.mark.parametrize( + "patient_name,cat_id,name", + [ + ("adam", 1, "0/1"), + ("eve", 0, "0/0"), + ("cain", 1, "0/1"), + ], + ) + def test_autosomal_dominant( + self, + patient_name: str, + cat_id: int, + name: str, + variant_predicate: VariantPredicate, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate) + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.cat_id == cat_id + assert categorization.category.name == name + + @pytest.mark.parametrize( + "patient_name,cat_id,name", + [ + ("walt", 1, "0/1"), + ("skyler", 1, "0/1"), + ("flynn", 2, "1/1"), + ("holly", 0, "0/0"), + ], + ) + def test_autosomal_recessive( + self, + patient_name: str, + cat_id: int, + name: str, + variant_predicate: VariantPredicate, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate) + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.cat_id == cat_id + assert categorization.category.name == name + + @pytest.mark.parametrize( + "patient_name,cat_id,name", + [ + ("anakin", 0, "0/0"), + ("padme", 1, "0/1"), + ("luke", 2, "1"), + ("leia", 1, "0/1"), + ], + ) + def test_x_recessive( + self, + patient_name: str, + cat_id: int, + name: str, + variant_predicate: VariantPredicate, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = ModeOfInheritancePredicate.x_recessive(variant_predicate) + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.cat_id == cat_id + assert categorization.category.name == name From ecf589c2220bb2e14d56806c1f8fbff0c0701eb5 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 30 Aug 2024 10:28:48 +0200 Subject: [PATCH 05/16] Move `DeVriesScorer` test elsewhere. --- tests/analysis/{ => pscore}/test_de_vries_scorer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename tests/analysis/{ => pscore}/test_de_vries_scorer.py (97%) diff --git a/tests/analysis/test_de_vries_scorer.py b/tests/analysis/pscore/test_de_vries_scorer.py similarity index 97% rename from tests/analysis/test_de_vries_scorer.py rename to tests/analysis/pscore/test_de_vries_scorer.py index 220dd18b0..89679a37f 100644 --- a/tests/analysis/test_de_vries_scorer.py +++ b/tests/analysis/pscore/test_de_vries_scorer.py @@ -4,7 +4,7 @@ import pytest from gpsea.analysis.pscore import DeVriesPhenotypeScorer -from gpsea.model import Patient, SampleLabels, Phenotype +from gpsea.model import Patient, SampleLabels, Phenotype, Sex intrauterine_growth_retardation = 'HP:0001511' small_for_gestational_age = 'HP:0001518' @@ -69,6 +69,7 @@ def test_a_patient( ): patient = Patient( labels=SampleLabels("test"), + sex=Sex.UNKNOWN_SEX, phenotypes=( Phenotype.from_raw_parts( term_id=curie, From 0eb40a6d711dd1dfd16ab028441a5b432b78de43 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 30 Aug 2024 10:32:55 +0200 Subject: [PATCH 06/16] Fine tune the inheritance mode assignment. --- .../analysis/predicate/genotype/_gt_predicates.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 601aa70bc..1cd1a6577 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -553,9 +553,17 @@ def test( if patient.sex.is_provided(): allele_count = self._allele_counter.count(patient) groups = self._moi_info.get_groups_for_allele_count(allele_count) - for group in groups: - if group.sex is not None and group.sex == patient.sex: - return group.categorization + if len(groups) == 0: + # Unable to assign the individual. + return None + elif len(groups) == 1: + # We can only assign into one category no matter what the individual's sex is. + return groups[0].categorization + else: + # We choose depending on the sex. + for group in groups: + if group.sex is not None and group.sex == patient.sex: + return group.categorization return None else: # We must have patient's sex From 29d126551ecc3b50f47d9a99a9d4cc40fdc3a9e3 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 30 Aug 2024 16:00:53 +0200 Subject: [PATCH 07/16] Improve documentation for predicates. --- docs/report/tbx5_frameshift_vs_missense.csv | 244 +------------- docs/tutorial.rst | 6 +- docs/user-guide/predicates.rst | 315 ++++++++++++++++-- docs/user-guide/report/tbx5_frameshift.csv | 282 ++-------------- docs/user-guide/stats.rst | 45 ++- .../predicate/genotype/_gt_predicates.py | 89 +++-- src/gpsea/model/_base.py | 6 +- tests/analysis/predicate/genotype/conftest.py | 15 +- .../predicate/genotype/test_gt_predicates.py | 58 ++-- 9 files changed, 445 insertions(+), 615 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 421e69272..07245a440 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -16,247 +16,7 @@ Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.77 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Abnormal carpal morphology [HP:0001191],30/32,94%,0/0,0%,, -Abnormal hand morphology [HP:0005922],53/53,100%,20/20,100%,, -Abnormality of the hand [HP:0001155],60/60,100%,31/31,100%,, +Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,, +Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,, Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,, Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,, -Phenotypic abnormality [HP:0000118],82/82,100%,38/38,100%,, -All [HP:0000001],82/82,100%,38/38,100%,, -Abnormality of the wrist [HP:0003019],30/30,100%,0/0,0%,, -Abnormality of upper limb joint [HP:0009810],30/30,100%,6/6,100%,, -Abnormal joint morphology [HP:0001367],31/31,100%,6/6,100%,, -Abnormal skeletal morphology [HP:0011842],73/73,100%,35/35,100%,, -Abnormality of the skeletal system [HP:0000924],73/73,100%,35/35,100%,, -Abnormality of the musculoskeletal system [HP:0033127],74/74,100%,35/35,100%,, -Short thumb [HP:0009778],11/41,27%,8/30,27%,, -Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,, -Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,, -Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,, -Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55,100%,22/22,100%,, -Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55,100%,22/22,100%,, -Aplasia/hypoplasia of the extremities [HP:0009815],55/55,100%,22/22,100%,, -Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56,100%,23/23,100%,, -Abnormal limb bone morphology [HP:0002813],63/63,100%,34/34,100%,, -Abnormality of limb bone [HP:0040068],63/63,100%,34/34,100%,, -Abnormal appendicular skeleton morphology [HP:0011844],64/64,100%,34/34,100%,, -Abnormal finger morphology [HP:0001167],36/36,100%,31/31,100%,, -Abnormal digit morphology [HP:0011297],38/38,100%,33/33,100%,, -Abnormal thumb morphology [HP:0001172],30/30,100%,31/31,100%,, -Short finger [HP:0009381],11/11,100%,8/8,100%,, -Short digit [HP:0011927],11/11,100%,10/10,100%,, -Complete atrioventricular canal defect [HP:0001674],5/37,14%,3/36,8%,, -Atrioventricular canal defect [HP:0006695],5/5,100%,3/3,100%,, -Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, -Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, -Abnormal cardiovascular system morphology [HP:0030680],63/63,100%,30/30,100%,, -Abnormality of the cardiovascular system [HP:0001626],65/65,100%,32/32,100%,, -11 pairs of ribs [HP:0000878],1/1,100%,0/0,0%,, -Missing ribs [HP:0000921],1/1,100%,0/0,0%,, -Aplasia/Hypoplasia of the ribs [HP:0006712],1/1,100%,0/0,0%,, -Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],1/1,100%,2/2,100%,, -Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,2/2,100%,, -Abnormal axial skeleton morphology [HP:0009121],8/8,100%,5/5,100%,, -Abnormal thorax morphology [HP:0000765],6/6,100%,5/5,100%,, -Abnormal rib morphology [HP:0000772],1/1,100%,0/0,0%,, -Abnormal rib cage morphology [HP:0001547],4/4,100%,0/0,0%,, -Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, -Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, -Abnormal cardiac atrium morphology [HP:0005120],43/43,100%,20/20,100%,, -Finger aplasia [HP:0009380],15/15,100%,14/14,100%,, -Aplasia involving forearm bones [HP:0009822],7/7,100%,6/6,100%,, -Aplasia/hypoplasia involving forearm bones [HP:0006503],37/37,100%,12/12,100%,, -Abnormal forearm bone morphology [HP:0040072],37/37,100%,14/14,100%,, -Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,, -Abnormal forearm morphology [HP:0002973],37/37,100%,14/14,100%,, -Aplasia/Hypoplasia of the radius [HP:0006501],37/37,100%,11/11,100%,, -Abnormal morphology of the radius [HP:0002818],37/37,100%,13/13,100%,, -Absent forearm bone [HP:0003953],7/7,100%,6/6,100%,, -Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,, -Upper limb undergrowth [HP:0009824],33/33,100%,7/7,100%,, -Limb undergrowth [HP:0009826],33/33,100%,7/7,100%,, -Short long bone [HP:0003026],35/35,100%,9/9,100%,, -Abnormal long bone morphology [HP:0011314],44/44,100%,13/13,100%,, -Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, -Abnormality of thumb phalanx [HP:0009602],13/13,100%,13/13,100%,, -Limited pronation/supination of forearm [HP:0006394],0/0,0%,3/3,100%,, -Limited elbow movement [HP:0002996],0/0,0%,4/4,100%,, -Abnormality of the elbow [HP:0009811],0/0,0%,5/5,100%,, -Limitation of joint mobility [HP:0001376],0/0,0%,4/4,100%,, -Abnormality of joint mobility [HP:0011729],1/1,100%,5/5,100%,, -Abnormal joint physiology [HP:0034430],1/1,100%,5/5,100%,, -Abnormal musculoskeletal physiology [HP:0011843],1/1,100%,5/5,100%,, -Abnormality of cardiovascular system electrophysiology [HP:0030956],15/15,100%,3/3,100%,, -Abnormal cardiovascular system physiology [HP:0011025],23/23,100%,5/5,100%,, -Pre-capillary pulmonary hypertension [HP:0033578],4/4,100%,0/0,0%,, -Elevated pulmonary artery pressure [HP:0004890],4/4,100%,0/0,0%,, -Abnormality of pulmonary circulation [HP:0030875],4/4,100%,0/0,0%,, -Abnormal vascular physiology [HP:0030163],4/4,100%,0/0,0%,, -Abnormality of the vasculature [HP:0002597],10/10,100%,2/2,100%,, -Abnormal respiratory system physiology [HP:0002795],4/4,100%,0/0,0%,, -Abnormality of the respiratory system [HP:0002086],4/4,100%,0/0,0%,, -Tricuspid regurgitation [HP:0005180],3/3,100%,0/0,0%,, -Atrioventricular valve regurgitation [HP:0034376],4/4,100%,2/2,100%,, -Abnormal atrioventricular valve physiology [HP:0031650],4/4,100%,2/2,100%,, -Abnormal heart valve physiology [HP:0031653],4/4,100%,2/2,100%,, -Abnormal tricuspid valve physiology [HP:0031651],3/3,100%,0/0,0%,, -Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,0/0,0%,, -Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,0/0,0%,, -Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,0/0,0%,, -Abnormal finger phalanx morphology [HP:0005918],3/3,100%,0/0,0%,, -Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,0/0,0%,, -Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,0/0,0%,, -Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,0/0,0%,, -Aplasia involving bones of the extremities [HP:0009825],0/0,0%,0/0,0%,, -Aplasia of metacarpal bones [HP:0010048],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,0/0,0%,, -Abnormal metacarpal morphology [HP:0005916],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,0/0,0%,, -Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,0/0,0%,, -Partial absence of thumb [HP:0009659],0/0,0%,0/0,0%,, -1-2 finger syndactyly [HP:0010704],3/3,100%,1/1,100%,, -Finger syndactyly [HP:0006101],4/4,100%,2/2,100%,, -Syndactyly [HP:0001159],4/4,100%,2/2,100%,, -Upper limb phocomelia [HP:0009813],8/85,9%,2/37,5%,, -Phocomelia [HP:0009829],8/8,100%,2/2,100%,, -Clinodactyly of the 5th finger [HP:0004209],1/1,100%,0/0,0%,, -Finger clinodactyly [HP:0040019],1/1,100%,0/0,0%,, -Clinodactyly [HP:0030084],1/1,100%,0/0,0%,, -Deviation of finger [HP:0004097],1/1,100%,2/2,100%,, -Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,2/2,100%,, -Deviation of the 5th finger [HP:0009179],1/1,100%,0/0,0%,, -Abnormal 5th finger morphology [HP:0004207],4/4,100%,0/0,0%,, -Patent foramen ovale [HP:0001655],4/40,10%,0/36,0%,, -Small hypothenar eminence [HP:0010487],2/2,100%,0/0,0%,, -Abnormality of the hypothenar eminence [HP:0010486],2/2,100%,0/0,0%,, -Abnormality of the musculature of the hand [HP:0001421],2/2,100%,0/0,0%,, -Abnormality of the musculature of the upper limbs [HP:0001446],2/2,100%,0/0,0%,, -Abnormality of the musculature of the limbs [HP:0009127],2/2,100%,0/0,0%,, -Abnormal skeletal muscle morphology [HP:0011805],2/2,100%,0/0,0%,, -Abnormality of the musculature [HP:0003011],2/2,100%,0/0,0%,, -Perimembranous ventricular septal defect [HP:0011682],3/59,5%,3/25,12%,, -Deviation of the thumb [HP:0009603],0/0,0%,2/2,100%,, -Aplasia/hypoplasia of the humerus [HP:0006507],7/7,100%,4/4,100%,, -Abnormality of the humerus [HP:0003063],7/7,100%,4/4,100%,, -Abnormality of the upper arm [HP:0001454],7/7,100%,4/4,100%,, -Right atrial enlargement [HP:0030718],4/4,100%,0/0,0%,, -Abnormal right atrium morphology [HP:0025580],4/4,100%,0/0,0%,, -Atrial septal dilatation [HP:0011995],4/4,100%,0/0,0%,, -Pectus excavatum [HP:0000767],3/4,75%,2/2,100%,, -Abnormal sternum morphology [HP:0000766],3/3,100%,2/2,100%,, -Postaxial hand polydactyly [HP:0001162],3/4,75%,0/0,0%,, -Postaxial polydactyly [HP:0100259],3/3,100%,0/0,0%,, -Polydactyly [HP:0010442],3/3,100%,0/0,0%,, -Hand polydactyly [HP:0001161],3/3,100%,0/0,0%,, -Duplication of phalanx of hand [HP:0009997],3/3,100%,0/0,0%,, -Duplication of hand bones [HP:0004275],3/3,100%,0/0,0%,, -Duplication of bones involving the upper extremities [HP:0009142],3/3,100%,0/0,0%,, -High palate [HP:0000218],3/3,100%,0/0,0%,, -Abnormal palate morphology [HP:0000174],5/5,100%,0/0,0%,, -Abnormal oral cavity morphology [HP:0000163],5/5,100%,1/1,100%,, -Abnormal oral morphology [HP:0031816],5/5,100%,1/1,100%,, -Abnormality of the mouth [HP:0000153],5/5,100%,1/1,100%,, -Abnormality of the face [HP:0000271],5/5,100%,1/1,100%,, -Abnormality of the head [HP:0000234],5/5,100%,2/2,100%,, -Abnormality of head or neck [HP:0000152],5/5,100%,2/2,100%,, -Short neck [HP:0000470],3/3,100%,0/0,0%,, -Abnormal neck morphology [HP:0025668],3/3,100%,0/0,0%,, -Abnormality of the neck [HP:0000464],3/3,100%,0/0,0%,, -Abnormality of the cervical spine [HP:0003319],3/3,100%,0/0,0%,, -Abnormality of the vertebral column [HP:0000925],4/4,100%,1/1,100%,, -Shield chest [HP:0000914],3/3,100%,0/0,0%,, -Enlarged thorax [HP:0100625],3/3,100%,0/0,0%,, -Y-shaped metatarsals [HP:0010567],3/3,100%,0/0,0%,, -Abnormal metatarsal morphology [HP:0001832],3/3,100%,0/0,0%,, -Abnormal lower limb bone morphology [HP:0040069],3/3,100%,0/0,0%,, -Abnormality of the lower limb [HP:0002814],3/3,100%,0/0,0%,, -Abnormal foot morphology [HP:0001760],3/3,100%,0/0,0%,, -Mitral regurgitation [HP:0001653],1/1,100%,2/2,100%,, -Abnormal mitral valve physiology [HP:0031481],1/1,100%,2/2,100%,, -First degree atrioventricular block [HP:0011705],0/22,0%,1/1,100%,, -Congenital malformation of the great arteries [HP:0011603],4/4,100%,2/2,100%,, -Abnormal morphology of the great vessels [HP:0030962],6/6,100%,2/2,100%,, -Abnormal blood vessel morphology [HP:0033353],6/6,100%,2/2,100%,, -Abnormal vascular morphology [HP:0025015],6/6,100%,2/2,100%,, -Abnormal skull morphology [HP:0000929],1/1,100%,2/2,100%,, -Sinus bradycardia [HP:0001688],0/0,0%,1/1,100%,, -Abnormal electrophysiology of sinoatrial node origin [HP:0011702],0/0,0%,1/1,100%,, -Arrhythmia [HP:0011675],1/1,100%,1/1,100%,, -Bradycardia [HP:0001662],0/0,0%,1/1,100%,, -Abnormal skin morphology [HP:0011121],0/0,0%,1/1,100%,, -Abnormality of the skin [HP:0000951],0/0,0%,1/1,100%,, -Abnormality of the integument [HP:0001574],0/0,0%,1/1,100%,, -Sinus venosus atrial septal defect [HP:0011567],0/2,0%,1/1,100%,, -Abnormal morphology of ulna [HP:0040071],2/2,100%,4/4,100%,, -Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,2/2,100%,, -Persistent left superior vena cava [HP:0005301],2/37,5%,0/0,0%,, -Abnormal superior vena cava morphology [HP:0025575],2/2,100%,0/0,0%,, -Abnormal vena cava morphology [HP:0005345],2/2,100%,0/0,0%,, -Abnormal venous morphology [HP:0002624],2/2,100%,0/0,0%,, -Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,, -Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,0/0,0%,, -Shoulder muscle hypoplasia [HP:0008952],0/0,0%,0/0,0%,, -Hypoplasia of the musculature [HP:0009004],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,0/0,0%,, -Abnormality of muscle size [HP:0030236],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,0/0,0%,, -Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,0/0,0%,, -Common atrium [HP:0011565],0/83,0%,0/38,0%,, -Unroofed coronary sinus [HP:0031297],0/85,0%,0/38,0%,, -Atrioventricular dissociation [HP:0011709],0/22,0%,1/1,100%,, -Bowed forearm bones [HP:0003956],0/0,0%,1/1,100%,, -Bowing of the arm [HP:0006488],0/0,0%,1/1,100%,, -Bowing of the long bones [HP:0006487],0/0,0%,1/1,100%,, -Abnormal diaphysis morphology [HP:0000940],0/0,0%,1/1,100%,, -Proximal placement of thumb [HP:0009623],0/0,0%,0/0,0%,, -Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,0/0,0%,, -Type A brachydactyly [HP:0009370],0/0,0%,0/0,0%,, -Brachydactyly [HP:0001156],0/0,0%,0/0,0%,, -Short 5th finger [HP:0009237],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,0/0,0%,, -Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,0/0,0%,, -Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,0/0,0%,, -Short middle phalanx of finger [HP:0005819],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,0/0,0%,, -Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,0/0,0%,, -Short phalanx of finger [HP:0009803],0/0,0%,0/0,0%,, -Micrognathia [HP:0000347],1/1,100%,1/1,100%,, -Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,, -Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,, -Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,, -Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,, -Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,, -Cleft soft palate [HP:0000185],2/2,100%,0/0,0%,, -Abnormal soft palate morphology [HP:0100736],2/2,100%,0/0,0%,, -Cleft palate [HP:0000175],2/2,100%,0/0,0%,, -Orofacial cleft [HP:0000202],2/2,100%,0/0,0%,, -Craniofacial cleft [HP:5201015],2/2,100%,0/0,0%,, -Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,, -Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,, -Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,, -Amelia involving the upper limbs [HP:0009812],0/83,0%,1/37,3%,, -Third degree atrioventricular block [HP:0001709],0/22,0%,1/1,100%,, -Upper extremity joint dislocation [HP:0030310],0/0,0%,2/2,100%,, -Joint dislocation [HP:0001373],0/0,0%,2/2,100%,, -Abnormal toe morphology [HP:0001780],0/0,0%,0/0,0%,, -Abnormal toe phalanx morphology [HP:0010161],0/0,0%,0/0,0%,, -Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,0/0,0%,, -Aplasia/Hypoplasia of the 2nd finger [HP:0006264],1/1,100%,0/0,0%,, -Abnormal 2nd finger morphology [HP:0004100],1/1,100%,0/0,0%,, -Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,, -Left ventricular noncompaction cardiomyopathy [HP:0011664],0/1,0%,1/5,20%,, -Noncompaction cardiomyopathy [HP:0012817],0/0,0%,1/1,100%,, -Cardiomyopathy [HP:0001638],0/0,0%,1/1,100%,, -Abnormal myocardium morphology [HP:0001637],0/0,0%,1/1,100%,, -Mitral valve prolapse [HP:0001634],0/0,0%,1/1,100%,, -Abnormal mitral valve morphology [HP:0001633],0/0,0%,1/1,100%,, -Abnormal atrioventricular valve morphology [HP:0006705],0/0,0%,1/1,100%,, -Abnormal heart valve morphology [HP:0001654],0/0,0%,1/1,100%,, -Short 1st metacarpal [HP:0010034],0/30,0%,0/22,0%,, -Short phalanx of the thumb [HP:0009660],0/30,0%,0/22,0%,, diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 10ce6a23a..8d1d47268 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -266,11 +266,11 @@ by exploring the phenotype MTC filtering report. .. raw:: html :file: report/tbx5_frameshift_vs_missense.mtc_report.html -and these are the HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: +and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: >>> from gpsea.analysis.predicate import PatientCategories >>> summary_df = result.summarize(hpo, PatientCategories.YES) ->>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP +>>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense :file: report/tbx5_frameshift_vs_missense.csv @@ -283,4 +283,4 @@ was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. Fisher exact test computed a p value of `~0.0000562` and the p value corrected by Benjamini-Hochberg procedure -is `~0.00112`. +is `~0.000899`. diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 00647f1f2..908afa147 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -4,23 +4,80 @@ Predicates ========== -GPSEA uses predicates to test if variant, patient, or any tested item -meets a condition. Based on the test results, the items are assigned into groups. +Searching for genotype-phenotype associations usually requires to partition +the individuals into several discrete groups to allow testing for the inter-group differences. +GPSEA reflects these requirements with its predicate API. +Perhaps unsurprisingly, a predicate must be capable of partitioning the individuals into two or more groups. +The groups must be *exclusive* - each individual must be assigned at most into one group. +Moreover, the groups should be *exhaustive* and cover maximum of the possible states. +However, the predicate is allowed to return `None` if the individual cannot be assigned. +In result, the individual will be omitted from the downstream analysis. + +Predicates serve both *genotype* and *phenotype* prongs of the analysis. +Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`) +assign the :class:`~gpsea.model.Patient` +into a group (mostly) based on the variant information, while the +phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`) +use the HPO terms to assign a group. + +It is literally impossible to use GPSEA without the predicates +because all analyses need at least one predicate (typically a *genotype* predicate). +Luckily, the purpose of this guide is to show all that is to know about predicates. +We will first discuss the genotype predicates and end with phenotype predicates. + +.. _genotype-predicates: + +******************* +Genotype predicates +******************* + +A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. +Typically, this includes using the genotype data, such as presence of a missense variant +in a heterozygous genotype. However, other categorical variables, +such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. + +The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. +The testing is done in two steps. First, we count the alleles +of the matching variants and then we interpret the count, possibly including factors +such as the expected mode of inheritance and sex, to assign the individual into a group. +Finding the matching variants is what +the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. + + +TODO: wordsmith +We must first create the variant predicate and then wrap it in genotype predicate. -As described in the :class:`~gpsea.analysis.predicate.PolyPredicate` API, -the groups must be *exclusive* - the item can be assigned with one and only one group, -and *exhaustive* - the groups must cover all possible scenarios. -However, if the item cannot be assigned into any meaningful category, -the predicate can return `None`, and the item will be omitted from the analysis. +Variant predicates +================== -The predicates can be chained to test for more complex conditions. -For instance, "test if a patient has a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". +GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class +to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. +The variant predicate can leverage multiple primary data: -Let's demonstrate this on an example with a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -We will load a cohort of 5 subjects with variants in *ANKRD11*, leading to KBG syndrome. -The the clinical signs and symptoms of the subjects were encoded into HPO terms -along with the pathogenic *ANKRD11* variant. ++------------------------+-------------------------------------------------------------------------------------------------+ +| Primary data source | Example | ++========================+=================================================================================================+ +| Allele | the variant being a deletion or a single nucleotide variant (SNV) | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Genome | overlaps of a target genomic region | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Protein data | variant is located in a region encoding a protein domain, protein feature type | ++------------------------+-------------------------------------------------------------------------------------------------+ + +which demands a considerable amount of flexibility for creating the predicate. + +As a rule of thumb, the predicates for testing basic conditions are available off the shelf, +and they can be used as building block for testing for more complex conditions, +such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". + +Let's demonstrate this on few examples. +We will load a cohort of 19 subjects with variants in *RERE*, +leading to `Holt-Oram syndrome MIM:142900 `_. +The the clinical signs and symptoms of the subjects were encoded into HPO terms +along with the pathogenic *RERE* variant. Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. Briefly, we first load HPO: @@ -36,7 +93,7 @@ then, we configure the cohort creator: which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets from the release `0.1.18` of `Phenopacket Store `_. -This time, however, we will load 19 individuals with mutations in *RERE* gene: +We will load 19 individuals with mutations in *RERE* gene: >>> from ppktstore.registry import configure_phenopacket_registry >>> registry = configure_phenopacket_registry() @@ -58,8 +115,8 @@ that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with >>> variant_key_of_interest = '1_8358231_8358231_T_C' >>> variant = cohort.get_variant_by_key(variant_key_of_interest) -Simple predicates -***************** +Building blocks +--------------- We can check that the variant overlaps with *RERE*: @@ -92,10 +149,10 @@ See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` for more info on the predicates available off the shelf. -Compound predicates -******************* +Complex conditions +------------------ -The simple predicates can be combined to test for more elaborate conditions. +We can combine the building blocks to test for more elaborate conditions. For instance, we can test if the variant meets *any* or several conditions: >>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) @@ -121,8 +178,8 @@ such as testing if the variant is a *"chromosomal deletion" or a deletion which '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' -Inverted predicate -****************** +Inverting conditions +-------------------- Sometimes we may want to test the variant for a condition that must *not* be met. For instance, we may want to test if the variant is a deletion @@ -151,9 +208,219 @@ This is how we can use the predicate inversion to build the predicate for non-fr Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. +The variant predicate offers a flexible API for testing if variants meet a condition. +However, the genotype phenotype correlations are done on the individual level +and the variant predicates are used as a component of the genotype predicate. +The next sections show how to use variant predicates to assign individuals into groups. + + +Mode of inheritance predicate +============================= + +The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` +assigns the individual into a group based on the number of alleles +that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. +The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports +the following Mendelian modes of inheritance (MoI): + + ++-----------------------+-----------------------------------+------------------+------------------------+ +| Mode of inheritance | Sex | Allele count | Genotype category | ++=======================+===================================+==================+========================+ +| Autosomal dominant | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| Autosomal recessive | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 2 | `BIALLELIC_ALT` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 3` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| X-linked dominant | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| X-linked recessive | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | ++ + +------------------+------------------------+ +| | | 2 | `BIALLELIC_ALT` | ++ + +------------------+------------------------+ +| | | :math:`\ge 3` | ``None`` | ++ +-----------------------------------+------------------+------------------------+ +| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | ++ + +------------------+------------------------+ +| | | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ + +.. note:: + + `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. + +Clinical judgment should be used to choose the MoI for the cohort analysis. +Then a predicate for the desired MoI can be created by one of +:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: + +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` + +All constructors take an instance +of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. + + +Example +------- + +Here we show seting up a predicate for grouping individuals based on +having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` +to test differences between the genotypes of a disease with an autosomal recessive MoI. + +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant meets the condition: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +for assigning a patient into a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.get_question() +'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' + +The `gt_predicate` can be used in downstream analysis, such as in :class: + + +Groups predicate +================ + +Sometimes, all we want is to compare if there is a difference between individuals +who include one or more alleles of variant $X$ vs. individuals with variants $Y$, +vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. +We can do this with a *groups* predicate. + +The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` +takes *n* variant predicates and *n* group labels, and it will assign the patients +into the respective groups if one or more matching allele is found. +However, only one predicate is allowed to return a non-zero allele count. +Otherwise, the patient is assigned with ``None`` and excluded from the analysis. + +Example +------- + +Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate +>>> tx_id = 'NM_1234.5' +>>> gt_predicate = groups_predicate( +... predicates=( +... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), +... ), +... group_names=('Missense', 'Frameshift', 'Synonymous'), +... ) +>>> gt_predicate.get_question() +'Genotype group: Missense, Frameshift, Synonymous' + + +.. _phenotype-predicates: + +******************** +Phenotype predicates +******************** + +The phenotype predicate assigns the individual into a group with respect to tested phenotype. +Typically, the phenotype corresponds to a clinical sign or symptom encoded into an HPO term. + + +Propagating phenotype predicate +=============================== + +When testing for presence or absence of an HPO term, the propagating phenotype predicate +leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. +In result, an individual annotated with a term is implicitly annotated with all its ancestors. +For instance, an individual annotated with `Ectopia lentis `_ +is also annotated with `Abnormal lens morphology `_, +`Abnormal anterior eye segment morphology `_, +`Abnormal eye morphology `_, ... + +Similarly, all descendants of a term, whose presence was specifically excluded in an individual, +are implicitly excluded. + +:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic. + +Example +------- + +Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` +to test for a presence of `Abnormal lens morphology `_. + + +>>> from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate +>>> query = hpotk.TermId.from_curie('HP:0000517') +>>> pheno_predicate = PropagatingPhenotypePredicate( +... hpo=hpo, +... query=query, +... ) +>>> pheno_predicate.get_question() +'Is Abnormal lens morphology present in the patient?' + + +TODO: explain ``missing_implies_phenotype_excluded`` + + +Predicates for all cohort phenotypes +==================================== + +Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. +The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` +function cuts down the tedium: + +>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest +>>> pheno_predicates = prepare_predicates_for_terms_of_interest( +... cohort=cohort, +... hpo=hpo, +... ) +>>> len(pheno_predicates) +301 + +and prepares predicates for testing 301 HPO terms of the *RERE* cohort. + + +******* +Gallery +******* + +Here we show examples of predicates used in some of our analyses. + +TODO + + +********** +Need more? +********** -That's it for predicates. Please see :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` +Please see :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` and :class:`~gpsea.analysis.predicate.genotype.ProteinPredicates` -for a comprehensive list of the predicates available off the shelf. +for a list of the predicates available off the shelf. -Please open an issue on our `GitHub tracker `_ if a predicate seems to be missing. +However, feel free to open an issue on our `GitHub tracker `_ +if a predicate seems to be missing. diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index 6abe64be2..a96700cfb 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,262 +1,22 @@ -FRAMESHIFT_VARIANT on NM_181486.4,Yes,Yes,No,No,, +"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],19/19,100%,42/71,59%,0.004112753923262261,0.00024192670136836828 -Abnormal atrioventricular conduction [HP:0005150],3/3,100%,1/23,4%,0.013076923076923076,0.0015384615384615385 -Absent thumb [HP:0009777],14/31,45%,18/100,18%,0.021044138590779585,0.0037136715160199273 -Atrioventricular block [HP:0001678],2/2,100%,1/23,4%,0.034,0.010000000000000002 -Heart block [HP:0012722],2/2,100%,1/23,4%,0.034,0.010000000000000002 -Patent ductus arteriosus [HP:0001643],2/2,100%,6/40,15%,0.09214092140921408,0.03252032520325203 -Secundum atrial septal defect [HP:0001684],4/22,18%,23/55,42%,0.1440020479198931,0.06544319142266644 -Triphalangeal thumb [HP:0001199],13/32,41%,23/99,23%,0.1440020479198931,0.06932119159387057 -Cardiac conduction abnormality [HP:0031546],3/3,100%,15/37,41%,0.1440020479198931,0.08259109311740892 -Muscular ventricular septal defect [HP:0011623],6/25,24%,8/84,10%,0.1440020479198931,0.08470708701170182 -Pulmonary arterial hypertension [HP:0002092],0/2,0%,8/14,57%,0.6899307928951144,0.4666666666666667 -Short thumb [HP:0009778],8/30,27%,25/69,36%,0.6899307928951144,0.4870099714553749 -Absent radius [HP:0003974],6/25,24%,9/43,21%,1.0,0.7703831604944444 -Hypoplasia of the radius [HP:0002984],6/14,43%,34/75,45%,1.0,1.0 -Atrial septal defect [HP:0001631],20/20,100%,63/65,97%,1.0,1.0 -Hypoplasia of the ulna [HP:0003022],2/10,20%,3/17,18%,1.0,1.0 -Short humerus [HP:0005792],4/9,44%,8/21,38%,1.0,1.0 -Abnormal ventricular septum morphology [HP:0010438],19/19,100%,42/42,100%,, -Abnormal cardiac ventricle morphology [HP:0001713],19/19,100%,43/43,100%,, -Abnormal heart morphology [HP:0001627],30/30,100%,89/89,100%,, -Abnormal cardiovascular system morphology [HP:0030680],30/30,100%,92/92,100%,, -Abnormality of the cardiovascular system [HP:0001626],32/32,100%,94/94,100%,, -Phenotypic abnormality [HP:0000118],38/38,100%,114/114,100%,, -All [HP:0000001],38/38,100%,114/114,100%,, -Abnormal cardiac septum morphology [HP:0001671],28/28,100%,89/89,100%,, -Forearm undergrowth [HP:0009821],7/7,100%,35/35,100%,, -Abnormal upper limb bone morphology [HP:0040070],14/14,100%,50/50,100%,, -Abnormality of the upper limb [HP:0002817],34/34,100%,102/102,100%,, -Abnormality of limbs [HP:0040064],34/34,100%,102/102,100%,, -Abnormal limb bone morphology [HP:0002813],34/34,100%,92/92,100%,, -Abnormality of limb bone [HP:0040068],34/34,100%,92/92,100%,, -Abnormality of the skeletal system [HP:0000924],35/35,100%,103/103,100%,, -Abnormality of the musculoskeletal system [HP:0033127],35/35,100%,104/104,100%,, -Abnormal appendicular skeleton morphology [HP:0011844],34/34,100%,93/93,100%,, -Abnormal skeletal morphology [HP:0011842],35/35,100%,103/103,100%,, -Upper limb undergrowth [HP:0009824],7/7,100%,38/38,100%,, -Limb undergrowth [HP:0009826],7/7,100%,38/38,100%,, -Aplasia/hypoplasia of the extremities [HP:0009815],22/22,100%,78/78,100%,, -Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23,100%,80/80,100%,, -Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22,100%,78/78,100%,, -Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22,100%,78/78,100%,, -Aplasia/hypoplasia involving forearm bones [HP:0006503],12/12,100%,43/43,100%,, -Abnormal forearm bone morphology [HP:0040072],14/14,100%,43/43,100%,, -Abnormal forearm morphology [HP:0002973],14/14,100%,43/43,100%,, -Short long bone [HP:0003026],9/9,100%,41/41,100%,, -Abnormal long bone morphology [HP:0011314],13/13,100%,50/50,100%,, -Aplasia/Hypoplasia of the radius [HP:0006501],11/11,100%,43/43,100%,, -Abnormal morphology of the radius [HP:0002818],13/13,100%,43/43,100%,, -1-2 finger syndactyly [HP:0010704],1/1,100%,4/4,100%,, -Finger syndactyly [HP:0006101],2/2,100%,5/5,100%,, -Syndactyly [HP:0001159],2/2,100%,5/5,100%,, -Abnormal digit morphology [HP:0011297],33/33,100%,67/67,100%,, -Abnormal atrial septum morphology [HP:0011994],20/20,100%,64/64,100%,, -Abnormal cardiac atrium morphology [HP:0005120],20/20,100%,64/64,100%,, -Perimembranous ventricular septal defect [HP:0011682],3/25,12%,6/84,7%,, -Aplasia/Hypoplasia of the thumb [HP:0009601],19/19,100%,40/40,100%,, -Aplasia/Hypoplasia of fingers [HP:0006265],19/19,100%,44/44,100%,, -Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19,100%,44/44,100%,, -Abnormal hand morphology [HP:0005922],20/20,100%,75/75,100%,, -Abnormality of the hand [HP:0001155],31/31,100%,88/88,100%,, -Abnormal finger morphology [HP:0001167],31/31,100%,64/64,100%,, -Abnormal thumb morphology [HP:0001172],31/31,100%,58/58,100%,, -Finger aplasia [HP:0009380],14/14,100%,23/23,100%,, -Aplasia involving forearm bones [HP:0009822],6/6,100%,9/9,100%,, -Absent forearm bone [HP:0003953],6/6,100%,9/9,100%,, -Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,4/4,100%,, -Abnormal morphology of ulna [HP:0040071],4/4,100%,4/4,100%,, -Abnormal carpal morphology [HP:0001191],0/0,0%,30/32,94%,, -Abnormality of the wrist [HP:0003019],0/0,0%,30/30,100%,, -Abnormality of upper limb joint [HP:0009810],6/6,100%,32/32,100%,, -Abnormal joint morphology [HP:0001367],6/6,100%,33/33,100%,, -Abnormality of cardiovascular system electrophysiology [HP:0030956],3/3,100%,18/18,100%,, -Abnormal cardiovascular system physiology [HP:0011025],5/5,100%,30/30,100%,, -Abnormality of thumb phalanx [HP:0009602],13/13,100%,26/26,100%,, -Upper limb phocomelia [HP:0009813],2/37,5%,8/116,7%,, -Phocomelia [HP:0009829],2/2,100%,8/8,100%,, -Short finger [HP:0009381],8/8,100%,27/27,100%,, -Short digit [HP:0011927],10/10,100%,28/28,100%,, -Pre-capillary pulmonary hypertension [HP:0033578],0/0,0%,8/8,100%,, -Elevated pulmonary artery pressure [HP:0004890],0/0,0%,8/8,100%,, -Abnormality of pulmonary circulation [HP:0030875],0/0,0%,8/8,100%,, -Abnormal vascular physiology [HP:0030163],0/0,0%,8/8,100%,, -Abnormality of the vasculature [HP:0002597],2/2,100%,17/17,100%,, -Abnormal respiratory system physiology [HP:0002795],0/0,0%,8/8,100%,, -Abnormality of the respiratory system [HP:0002086],0/0,0%,8/8,100%,, -Congenital malformation of the great arteries [HP:0011603],2/2,100%,7/7,100%,, -Abnormal morphology of the great vessels [HP:0030962],2/2,100%,10/10,100%,, -Abnormal blood vessel morphology [HP:0033353],2/2,100%,11/11,100%,, -Abnormal vascular morphology [HP:0025015],2/2,100%,11/11,100%,, -Patent foramen ovale [HP:0001655],0/36,0%,4/69,6%,, -Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,, -Abnormality of joint mobility [HP:0011729],5/5,100%,3/3,100%,, -Abnormal joint physiology [HP:0034430],5/5,100%,3/3,100%,, -Abnormal musculoskeletal physiology [HP:0011843],5/5,100%,3/3,100%,, -Abnormality of the vertebral column [HP:0000925],1/1,100%,4/4,100%,, -Abnormal axial skeleton morphology [HP:0009121],5/5,100%,9/9,100%,, -Complete atrioventricular canal defect [HP:0001674],3/36,8%,6/67,9%,, -Atrioventricular canal defect [HP:0006695],3/3,100%,6/6,100%,, -Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,6/6,100%,, -Shoulder muscle hypoplasia [HP:0008952],0/0,0%,6/6,100%,, -Hypoplasia of the musculature [HP:0009004],0/0,0%,6/6,100%,, -Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,6/6,100%,, -Abnormality of muscle size [HP:0030236],0/0,0%,6/6,100%,, -Abnormal skeletal muscle morphology [HP:0011805],0/0,0%,8/8,100%,, -Abnormality of the musculature [HP:0003011],0/0,0%,8/8,100%,, -Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,6/6,100%,, -Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,6/6,100%,, -Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,6/6,100%,, -Abnormality of the musculature of the limbs [HP:0009127],0/0,0%,8/8,100%,, -Abnormality of the musculature of the upper limbs [HP:0001446],0/0,0%,8/8,100%,, -Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,6/6,100%,, -Bowed forearm bones [HP:0003956],1/1,100%,0/0,0%,, -Bowing of the arm [HP:0006488],1/1,100%,0/0,0%,, -Bowing of the long bones [HP:0006487],1/1,100%,0/0,0%,, -Abnormal diaphysis morphology [HP:0000940],1/1,100%,0/0,0%,, -Micrognathia [HP:0000347],1/1,100%,1/1,100%,, -Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,, -Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,, -Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,3/3,100%,, -Abnormal skull morphology [HP:0000929],2/2,100%,1/1,100%,, -Abnormality of the head [HP:0000234],2/2,100%,5/5,100%,, -Abnormality of head or neck [HP:0000152],2/2,100%,5/5,100%,, -Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,, -Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,, -Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,, -Abnormal oral cavity morphology [HP:0000163],1/1,100%,5/5,100%,, -Abnormal oral morphology [HP:0031816],1/1,100%,5/5,100%,, -Abnormality of the mouth [HP:0000153],1/1,100%,5/5,100%,, -Abnormality of the face [HP:0000271],1/1,100%,5/5,100%,, -Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],2/2,100%,2/2,100%,, -Abnormal thorax morphology [HP:0000765],5/5,100%,7/7,100%,, -Persistent left superior vena cava [HP:0005301],0/0,0%,4/39,10%,, -Abnormal superior vena cava morphology [HP:0025575],0/0,0%,4/4,100%,, -Abnormal vena cava morphology [HP:0005345],0/0,0%,4/4,100%,, -Abnormal venous morphology [HP:0002624],0/0,0%,4/4,100%,, -Aplasia/hypoplasia of the humerus [HP:0006507],4/4,100%,8/8,100%,, -Abnormality of the humerus [HP:0003063],4/4,100%,8/8,100%,, -Abnormality of the upper arm [HP:0001454],4/4,100%,8/8,100%,, -Right atrial enlargement [HP:0030718],0/0,0%,4/4,100%,, -Abnormal right atrium morphology [HP:0025580],0/0,0%,4/4,100%,, -Atrial septal dilatation [HP:0011995],0/0,0%,4/4,100%,, -Left ventricular noncompaction cardiomyopathy [HP:0011664],1/5,20%,1/7,14%,, -Noncompaction cardiomyopathy [HP:0012817],1/1,100%,1/1,100%,, -Cardiomyopathy [HP:0001638],1/1,100%,1/1,100%,, -Abnormal myocardium morphology [HP:0001637],1/1,100%,1/1,100%,, -Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,, -Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,, -Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,, -Abnormal toe morphology [HP:0001780],0/0,0%,1/1,100%,, -Abnormal foot morphology [HP:0001760],0/0,0%,4/4,100%,, -Abnormality of the lower limb [HP:0002814],0/0,0%,4/4,100%,, -Abnormal toe phalanx morphology [HP:0010161],0/0,0%,1/1,100%,, -Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,1/1,100%,, -Abnormal lower limb bone morphology [HP:0040069],0/0,0%,4/4,100%,, -Aplasia involving bones of the extremities [HP:0009825],0/0,0%,3/3,100%,, -Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,3/3,100%,, -Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,3/3,100%,, -Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,3/3,100%,, -Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,3/3,100%,, -Abnormal finger phalanx morphology [HP:0005918],0/0,0%,9/9,100%,, -Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,6/6,100%,, -Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,3/3,100%,, -Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,3/3,100%,, -Aplasia of metacarpal bones [HP:0010048],0/0,0%,3/3,100%,, -Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,4/4,100%,, -Abnormal metacarpal morphology [HP:0005916],0/0,0%,4/4,100%,, -Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,4/4,100%,, -Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,4/4,100%,, -Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,4/4,100%,, -Partial absence of thumb [HP:0009659],0/0,0%,3/3,100%,, -Limited pronation/supination of forearm [HP:0006394],3/3,100%,2/2,100%,, -Limited elbow movement [HP:0002996],4/4,100%,2/2,100%,, -Abnormality of the elbow [HP:0009811],5/5,100%,2/2,100%,, -Limitation of joint mobility [HP:0001376],4/4,100%,2/2,100%,, -Mitral regurgitation [HP:0001653],2/2,100%,2/2,100%,, -Atrioventricular valve regurgitation [HP:0034376],2/2,100%,7/7,100%,, -Abnormal atrioventricular valve physiology [HP:0031650],2/2,100%,7/7,100%,, -Abnormal heart valve physiology [HP:0031653],2/2,100%,7/7,100%,, -Abnormal mitral valve physiology [HP:0031481],2/2,100%,2/2,100%,, -Tricuspid regurgitation [HP:0005180],0/0,0%,5/5,100%,, -Abnormal tricuspid valve physiology [HP:0031651],0/0,0%,5/5,100%,, -Cleft soft palate [HP:0000185],0/0,0%,2/2,100%,, -Abnormal soft palate morphology [HP:0100736],0/0,0%,2/2,100%,, -Abnormal palate morphology [HP:0000174],0/0,0%,5/5,100%,, -Cleft palate [HP:0000175],0/0,0%,2/2,100%,, -Orofacial cleft [HP:0000202],0/0,0%,2/2,100%,, -Craniofacial cleft [HP:5201015],0/0,0%,2/2,100%,, -Sinus bradycardia [HP:0001688],1/1,100%,2/2,100%,, -Abnormal electrophysiology of sinoatrial node origin [HP:0011702],1/1,100%,2/2,100%,, -Arrhythmia [HP:0011675],1/1,100%,3/3,100%,, -Bradycardia [HP:0001662],1/1,100%,2/2,100%,, -Pectus excavatum [HP:0000767],2/2,100%,3/4,75%,, -Abnormal sternum morphology [HP:0000766],2/2,100%,3/3,100%,, -Deviation of the thumb [HP:0009603],2/2,100%,3/3,100%,, -Deviation of finger [HP:0004097],2/2,100%,4/4,100%,, -Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,5/5,100%,, -Abnormal skin morphology [HP:0011121],1/1,100%,0/0,0%,, -Abnormality of the skin [HP:0000951],1/1,100%,0/0,0%,, -Abnormality of the integument [HP:0001574],1/1,100%,0/0,0%,, -Sinus venosus atrial septal defect [HP:0011567],1/1,100%,0/2,0%,, -First degree atrioventricular block [HP:0011705],1/1,100%,1/23,4%,, -Postaxial hand polydactyly [HP:0001162],0/0,0%,3/4,75%,, -Postaxial polydactyly [HP:0100259],0/0,0%,3/3,100%,, -Polydactyly [HP:0010442],0/0,0%,3/3,100%,, -Abnormal 5th finger morphology [HP:0004207],0/0,0%,6/6,100%,, -Hand polydactyly [HP:0001161],0/0,0%,3/3,100%,, -Duplication of phalanx of hand [HP:0009997],0/0,0%,3/3,100%,, -Duplication of hand bones [HP:0004275],0/0,0%,3/3,100%,, -Duplication of bones involving the upper extremities [HP:0009142],0/0,0%,3/3,100%,, -High palate [HP:0000218],0/0,0%,3/3,100%,, -Short neck [HP:0000470],0/0,0%,3/3,100%,, -Abnormal neck morphology [HP:0025668],0/0,0%,3/3,100%,, -Abnormality of the neck [HP:0000464],0/0,0%,3/3,100%,, -Abnormality of the cervical spine [HP:0003319],0/0,0%,3/3,100%,, -Shield chest [HP:0000914],0/0,0%,3/3,100%,, -Enlarged thorax [HP:0100625],0/0,0%,3/3,100%,, -Abnormal rib cage morphology [HP:0001547],0/0,0%,5/5,100%,, -Y-shaped metatarsals [HP:0010567],0/0,0%,3/3,100%,, -Abnormal metatarsal morphology [HP:0001832],0/0,0%,3/3,100%,, -11 pairs of ribs [HP:0000878],0/0,0%,2/2,100%,, -Missing ribs [HP:0000921],0/0,0%,2/2,100%,, -Aplasia/Hypoplasia of the ribs [HP:0006712],0/0,0%,2/2,100%,, -Abnormal rib morphology [HP:0000772],0/0,0%,2/2,100%,, -Aplasia/Hypoplasia of the 2nd finger [HP:0006264],0/0,0%,2/2,100%,, -Abnormal 2nd finger morphology [HP:0004100],0/0,0%,2/2,100%,, -Small hypothenar eminence [HP:0010487],0/0,0%,2/2,100%,, -Abnormality of the hypothenar eminence [HP:0010486],0/0,0%,2/2,100%,, -Abnormality of the musculature of the hand [HP:0001421],0/0,0%,2/2,100%,, -Upper extremity joint dislocation [HP:0030310],2/2,100%,0/0,0%,, -Joint dislocation [HP:0001373],2/2,100%,0/0,0%,, -Amelia involving the upper limbs [HP:0009812],1/37,3%,0/114,0%,, -Third degree atrioventricular block [HP:0001709],1/1,100%,0/22,0%,, -Atrioventricular dissociation [HP:0011709],1/1,100%,0/22,0%,, -Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,, -Proximal placement of thumb [HP:0009623],0/0,0%,3/3,100%,, -Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,2/2,100%,, -Type A brachydactyly [HP:0009370],0/0,0%,2/2,100%,, -Brachydactyly [HP:0001156],0/0,0%,2/2,100%,, -Short 5th finger [HP:0009237],0/0,0%,2/2,100%,, -Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,2/2,100%,, -Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,2/2,100%,, -Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,2/2,100%,, -Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,2/2,100%,, -Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,2/2,100%,, -Short middle phalanx of finger [HP:0005819],0/0,0%,2/2,100%,, -Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,2/2,100%,, -Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,2/2,100%,, -Short phalanx of finger [HP:0009803],0/0,0%,2/2,100%,, -Common atrium [HP:0011565],0/38,0%,1/115,1%,, -Unroofed coronary sinus [HP:0031297],0/38,0%,1/117,1%,, -Clinodactyly of the 5th finger [HP:0004209],0/0,0%,2/2,100%,, -Finger clinodactyly [HP:0040019],0/0,0%,2/2,100%,, -Clinodactyly [HP:0030084],0/0,0%,2/2,100%,, -Deviation of the 5th finger [HP:0009179],0/0,0%,2/2,100%,, -Mitral valve prolapse [HP:0001634],1/1,100%,1/1,100%,, -Abnormal mitral valve morphology [HP:0001633],1/1,100%,1/1,100%,, -Abnormal atrioventricular valve morphology [HP:0006705],1/1,100%,1/1,100%,, -Abnormal heart valve morphology [HP:0001654],1/1,100%,1/1,100%,, -Short 1st metacarpal [HP:0010034],0/22,0%,1/45,2%,, -Short phalanx of the thumb [HP:0009660],0/22,0%,1/45,2%,, +Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 +Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 +Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.021044138590779585,0.003713671516019927 +Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.034,0.01 +Heart block [HP:0012722],1/23,4%,2/2,100%,0.034,0.01 +Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.09214092140921408,0.03252032520325203 +Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.1440020479198931,0.06544319142266644 +Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.1440020479198931,0.06932119159387057 +Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.1440020479198931,0.08259109311740892 +Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.1440020479198931,0.08470708701170182 +Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667 +Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483 +Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444 +Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 +Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0 +Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0 +Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0 +Abnormal atrial septum morphology [HP:0011994],64/64,100%,20/20,100%,, +Abnormal cardiac septum morphology [HP:0001671],89/89,100%,28/28,100%,, +Abnormal heart morphology [HP:0001627],89/89,100%,30/30,100%,, diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 3eeca2bfc..96e9376e0 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -122,13 +122,34 @@ We want to separate the patients into two groups: a group *with* a frameshift va and a group *without* a frameshift variant, based on the functional annotation. We will use the *MANE* transcript for the analysis: +Building a genotype predicate is a two step process. +First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +to test if the variant leads to a frameshift (in this case): + >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate >>> tx_id = 'NM_181486.4' ->>> gt_predicate = boolean_predicate(VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id)) ->>> gt_predicate.get_question() +>>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) +>>> is_frameshift.get_question() 'FRAMESHIFT_VARIANT on NM_181486.4' +and then we choose the expected mode of inheritance to test. In case of *TBX5*, +we expect the autosomal dominant mode of inheritance: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) +>>> gt_predicate.get_question() +'Which genotype group does the patient fit in: HOM_REF, HET' + +`gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group +and the patients with one frameshift allele will be assigned into `HET` group. +Note, any patient with 2 or more alleles will be *omitted* from the analysis. + +.. note:: + + Mode of inheritance testing is not the only way to dissect by a genotype. + See the :ref:`genotype-predicates` section for more info. + **Phenotype predicates** @@ -229,18 +250,32 @@ We can learn more by showing the MTC filter report: Genotype phenotype associations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Last, let's explore the associations. This is a table of the tested HPO terms -ordered by the corrected p value (Benjamini-Hochberg FDR): +Last, let's explore the associations. The results include a table with all tested HPO terms +ordered by the corrected p value (Benjamini-Hochberg FDR). +Here we show the top 20 table rows: >>> from gpsea.analysis.predicate import PatientCategories >>> summary_df = result.summarize(hpo, PatientCategories.YES) ->>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP +>>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest :file: report/tbx5_frameshift.csv :header-rows: 2 +The table shows that several HPO terms are significantly associated +with presence of a heterozygous (`HET`) frameshift variant in *TBX5*. +For example, `Ventricular septal defect `_ +was observed in 31/60 (52%) patients with a missense variant +but it was observed in 19/19 (100%) patients with a frameshift variant. +Fisher exact test computed a p value of `~0.000242` +and the p value corrected by Benjamini-Hochberg procedure +is `~0.00411`. + +The table includes all HPO terms of the cohort, including the terms that were not selected for testing +and thus have no associated p value. + + .. _phenotype-score-stats: *************** diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 1cd1a6577..a032aae95 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -273,24 +273,42 @@ class MendelianInheritanceAspect(enum.Enum): class ModeOfInheritanceInfo: + # NOT PART OF THE PUBLIC API!!! + + HOM_REF = Categorization( + PatientCategory( + cat_id=0, name="HOM_REF", description="Homozygous reference", + ), + ) + HET = Categorization( + PatientCategory( + cat_id=1, name="HET", description="Heterozygous", + ), + ) + BIALLELIC_ALT = Categorization( + PatientCategory( + cat_id=2, name="BIALLELIC_ALT", + description="Homozygous alternate or compound heterozygous", + ), + ) + HEMI = Categorization( + PatientCategory( + cat_id=3, name="HEMI", description="Hemizygous", + ), + ) + @staticmethod def autosomal_dominant() -> "ModeOfInheritanceInfo": groups = ( GenotypeGroup( allele_count=0, sex=None, - categorization=Categorization( - PatientCategory( - cat_id=0, name="0/0", description="Homozygous reference" - ), - ), + categorization=ModeOfInheritanceInfo.HOM_REF, ), GenotypeGroup( allele_count=1, sex=None, - categorization=Categorization( - PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), - ), + categorization=ModeOfInheritanceInfo.HET, ), ) return ModeOfInheritanceInfo( @@ -304,25 +322,17 @@ def autosomal_recessive() -> "ModeOfInheritanceInfo": GenotypeGroup( allele_count=0, sex=None, - categorization=Categorization( - PatientCategory( - cat_id=0, name="0/0", description="Homozygous reference" - ), - ), + categorization=ModeOfInheritanceInfo.HOM_REF, ), GenotypeGroup( allele_count=1, sex=None, - categorization=Categorization( - PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), - ), + categorization=ModeOfInheritanceInfo.HET, ), GenotypeGroup( allele_count=2, sex=None, - categorization=Categorization( - PatientCategory(cat_id=2, name="1/1", description="Homozygous alternate"), - ), + categorization=ModeOfInheritanceInfo.BIALLELIC_ALT, ), ) return ModeOfInheritanceInfo( @@ -336,25 +346,12 @@ def x_dominant() -> "ModeOfInheritanceInfo": GenotypeGroup( allele_count=0, sex=None, - categorization=Categorization( - PatientCategory( - cat_id=0, name="0", description="Homozygous reference" - ), - ), - ), - GenotypeGroup( - allele_count=1, - sex=Sex.FEMALE, - categorization=Categorization( - PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), - ), + categorization=ModeOfInheritanceInfo.HOM_REF, ), GenotypeGroup( allele_count=1, - sex=Sex.MALE, - categorization=Categorization( - PatientCategory(cat_id=2, name="1", description="Hemizygous"), - ), + sex=None, + categorization=ModeOfInheritanceInfo.HET, ), ) return ModeOfInheritanceInfo( @@ -368,25 +365,22 @@ def x_recessive() -> "ModeOfInheritanceInfo": GenotypeGroup( allele_count=0, sex=None, - categorization=Categorization( - PatientCategory( - cat_id=0, name="0/0", description="Homozygous reference" - ), - ), + categorization=ModeOfInheritanceInfo.HOM_REF, ), GenotypeGroup( allele_count=1, sex=Sex.FEMALE, - categorization=Categorization( - PatientCategory(cat_id=1, name="0/1", description="Heterozygous"), - ), + categorization=ModeOfInheritanceInfo.HET, + ), + GenotypeGroup( + allele_count=2, + sex=Sex.FEMALE, + categorization=ModeOfInheritanceInfo.BIALLELIC_ALT, ), GenotypeGroup( allele_count=1, sex=Sex.MALE, - categorization=Categorization( - PatientCategory(cat_id=2, name="1", description="Hemizygous"), - ), + categorization=ModeOfInheritanceInfo.HEMI, ), ) @@ -400,7 +394,8 @@ def __init__( mendelian_inheritance_aspect: MendelianInheritanceAspect, groups: typing.Iterable[GenotypeGroup], ): - # We pre-compute the hash manually. + # We want this to be hashable but also keep a non-hashable dict + # as a field. Therefore, we pre-compute the hash manually. # The correctness depends on two default dicts with same keys and values # comparing equal. hash_value = 17 diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py index 0d541a65d..09a1c913d 100644 --- a/src/gpsea/model/_base.py +++ b/src/gpsea/model/_base.py @@ -13,17 +13,17 @@ class Sex(enum.Enum): UNKNOWN_SEX = 0 """ - Not assessed or not available. Maps to `NCIT:C17998`. + Not assessed or not available. Maps to ``NCIT:C17998``. """ FEMALE = 1 """ - Female sex. Maps to `NCIT:C46113`. + Female sex. Maps to ``NCIT:C46113``. """ MALE = 2 """ - Male sex. Maps to `NCIT:C46112`. + Male sex. Maps to ``NCIT:C46112``. """ def is_provided(self) -> bool: diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index c44a214b8..7138e93ff 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -211,10 +211,10 @@ def patient_w_frameshift( """ -Genesis family - Autosomal dominant +Genesis family - Autosomal dominant but can also be used as X dominant. -* Adam - father, affected -* Eve - mother, unaffected +* Adam - father, unaffected +* Eve - mother, affected * Cain - son, affected """ @@ -260,8 +260,8 @@ def genesis_mutation( ), genotypes=Genotypes.from_mapping( { - adam_label: Genotype.HETEROZYGOUS, - eve_label: Genotype.HOMOZYGOUS_REFERENCE, + adam_label: Genotype.HOMOZYGOUS_REFERENCE, + eve_label: Genotype.HETEROZYGOUS, cain_label: Genotype.HETEROZYGOUS, } ), @@ -597,8 +597,3 @@ def leia( diseases=(), variants=(skywalker_mutation,), ) - - -""" -XR family -""" diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index ac6977c8e..36904159f 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -84,17 +84,16 @@ def variant_predicate(self) -> VariantPredicate: return VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) @pytest.mark.parametrize( - "patient_name,cat_id,name", + "patient_name,name", [ - ("adam", 1, "0/1"), - ("eve", 0, "0/0"), - ("cain", 1, "0/1"), + ("adam", "HOM_REF"), + ("eve", "HET"), + ("cain", "HET"), ], ) def test_autosomal_dominant( self, patient_name: str, - cat_id: int, name: str, variant_predicate: VariantPredicate, request: pytest.FixtureRequest, @@ -106,22 +105,20 @@ def test_autosomal_dominant( assert categorization is not None - assert categorization.category.cat_id == cat_id assert categorization.category.name == name @pytest.mark.parametrize( - "patient_name,cat_id,name", + "patient_name,name", [ - ("walt", 1, "0/1"), - ("skyler", 1, "0/1"), - ("flynn", 2, "1/1"), - ("holly", 0, "0/0"), + ("walt", "HET"), + ("skyler", "HET"), + ("flynn", "BIALLELIC_ALT"), + ("holly", "HOM_REF"), ], ) def test_autosomal_recessive( self, patient_name: str, - cat_id: int, name: str, variant_predicate: VariantPredicate, request: pytest.FixtureRequest, @@ -133,22 +130,44 @@ def test_autosomal_recessive( assert categorization is not None - assert categorization.category.cat_id == cat_id assert categorization.category.name == name @pytest.mark.parametrize( - "patient_name,cat_id,name", + "patient_name,name", [ - ("anakin", 0, "0/0"), - ("padme", 1, "0/1"), - ("luke", 2, "1"), - ("leia", 1, "0/1"), + ("adam", "HOM_REF"), + ("eve", "HET"), + ("cain", "HET"), + ], + ) + def test_x_dominant( + self, + patient_name: str, + name: str, + variant_predicate: VariantPredicate, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = ModeOfInheritancePredicate.x_dominant(variant_predicate) + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.name == name + + @pytest.mark.parametrize( + "patient_name,name", + [ + ("anakin", "HOM_REF"), + ("padme", "HET"), + ("luke", "HEMI"), + ("leia", "HET"), ], ) def test_x_recessive( self, patient_name: str, - cat_id: int, name: str, variant_predicate: VariantPredicate, request: pytest.FixtureRequest, @@ -160,5 +179,4 @@ def test_x_recessive( assert categorization is not None - assert categorization.category.cat_id == cat_id assert categorization.category.name == name From 7acfc72dbaace44ad12e1107b869e47294b4ca08 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 1 Sep 2024 23:24:19 +0200 Subject: [PATCH 08/16] Check compatibility between the predicates and the count statistic. --- docs/tutorial.rst | 4 +- docs/user-guide/mtc.rst | 4 +- docs/user-guide/stats.rst | 4 +- src/gpsea/analysis/pcats/_impl.py | 74 ++++++++++++++- src/gpsea/analysis/pcats/stats/__init__.py | 4 +- src/gpsea/analysis/pcats/stats/_stats.py | 90 ++++++++++++------- .../analysis/pcats/stats/_test__stats.py | 9 +- tests/analysis/pcats/test_disease.py | 4 +- .../analysis/pcats/test_hpo_term_analysis.py | 4 +- tests/test_tutorial.py | 4 +- 10 files changed, 147 insertions(+), 54 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 8d1d47268..e047943ce 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -224,8 +224,8 @@ with a false discovery control level at (``mtc_alpha=0.05``): Choosing the statistical procedure for assessment of association between genotype and phenotype groups is the last missing piece of the analysis. We will use Fisher Exact Test: ->>> from gpsea.analysis.pcats.stats import ScipyFisherExact ->>> count_statistic = ScipyFisherExact() +>>> from gpsea.analysis.pcats.stats import FisherExactTest +>>> count_statistic = FisherExactTest() and we finalize the analysis setup by putting all components together into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index f8958b8ae..070539d19 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -96,9 +96,9 @@ when creating an instance of :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: >>> from gpsea.analysis.mtc_filter import UseAllTermsMtcFilter >>> from gpsea.analysis.pcats import HpoTermAnalysis ->>> from gpsea.analysis.pcats.stats import ScipyFisherExact +>>> from gpsea.analysis.pcats.stats import FisherExactTest >>> analysis = HpoTermAnalysis( -... count_statistic=ScipyFisherExact(), +... count_statistic=FisherExactTest(), ... mtc_filter=UseAllTermsMtcFilter(), ... mtc_correction='bonferroni', # <--- The MTC correction setup ... ) diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 96e9376e0..16f0060b7 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -176,8 +176,8 @@ including the *indirect* annotations whose presence is implied by the true path We will use :ref: to test the association between genotype and phenotype groups, as described previously. ->>> from gpsea.analysis.pcats.stats import ScipyFisherExact ->>> count_statistic = ScipyFisherExact() +>>> from gpsea.analysis.pcats.stats import FisherExactTest +>>> count_statistic = FisherExactTest() FET will compute a p value for each genotype phenotype group. diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index e4513fa0b..6600dab46 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -1,5 +1,6 @@ import abc import math +import os import typing from collections import Counter @@ -273,20 +274,87 @@ def __init__( (e.g. Bonferroni MTC) or false discovery rate for the FDR procedures (e.g. Benjamini-Hochberg). """ assert isinstance(count_statistic, CountStatistic) + assert len(count_statistic.supports_shape) == 2, "The statistic must support 2D contingency tables" self._count_statistic = count_statistic self._mtc_correction = mtc_correction assert isinstance(mtc_alpha, float) and 0. <= mtc_alpha <= 1. self._mtc_alpha = mtc_alpha - @abc.abstractmethod def compare_genotype_vs_phenotypes( self, cohort: typing.Iterable[Patient], gt_predicate: GenotypePolyPredicate, pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + ) -> MultiPhenotypeAnalysisResult[P]: + # Check compatibility between the count statistic and predicate. + issues = MultiPhenotypeAnalysis._check_compatibility( + count_statistic=self._count_statistic, + gt_predicate=gt_predicate, + pheno_predicates=pheno_predicates, + ) + if len(issues) != 0: + msg = os.linesep.join(issues) + raise ValueError(f'Cannot execute the analysis: {msg}') + + return self._compute_result( + cohort=cohort, + gt_predicate=gt_predicate, + pheno_predicates=pheno_predicates, + ) + + @abc.abstractmethod + def _compute_result( + self, + cohort: typing.Iterable[Patient], + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], ) -> MultiPhenotypeAnalysisResult[P]: pass + @staticmethod + def _check_compatibility( + count_statistic: CountStatistic, + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + ) -> typing.Collection[str]: + # There should be 2 items due to check in `__init__`. + (pheno, geno) = count_statistic.supports_shape + + issues = [] + # Check phenotype + if isinstance(pheno, int): + pheno_accepted = (pheno,) + elif isinstance(pheno, typing.Sequence): + pheno_accepted = pheno + else: + issues.append('Cannot use a count statistic that does not check phenotypes') + + pheno_failed = [] + for i, ph_predicate in enumerate(pheno_predicates): + if ph_predicate.n_categorizations not in pheno_accepted: + pheno_failed.append(i) + if len(pheno_failed) != 0: + issues.append( + 'Phenotype predicates {} are incompatible with the count statistic'.format( + ', '.join(str(i) for i in pheno_failed) + ) + ) + + # Check genotype + if isinstance(geno, int): + geno_accepted = (geno,) + elif isinstance(geno, typing.Sequence): + geno_accepted = geno + elif pheno is None: + raise ValueError('Cannot use a count statistic that does not check genotypes') + else: + raise ValueError(f'Cannot use a count statistic that supports shape {pheno, geno}') + + if gt_predicate.n_categorizations not in geno_accepted: + issues.append('Genotype predicate is incompatible with the count statistic') + + return issues + def _compute_nominal_pvals( self, n_usable: typing.Iterable[int], @@ -380,7 +448,7 @@ def __hash__(self): class DiseaseAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]): - def compare_genotype_vs_phenotypes( + def _compute_result( self, cohort: typing.Iterable[Patient], gt_predicate: GenotypePolyPredicate, @@ -513,7 +581,7 @@ def __init__( assert isinstance(mtc_filter, PhenotypeMtcFilter) self._mtc_filter = mtc_filter - def compare_genotype_vs_phenotypes( + def _compute_result( self, cohort: typing.Iterable[Patient], gt_predicate: GenotypePolyPredicate, diff --git a/src/gpsea/analysis/pcats/stats/__init__.py b/src/gpsea/analysis/pcats/stats/__init__.py index 4d7e05437..a535129d4 100644 --- a/src/gpsea/analysis/pcats/stats/__init__.py +++ b/src/gpsea/analysis/pcats/stats/__init__.py @@ -1,5 +1,5 @@ -from ._stats import CountStatistic, ScipyFisherExact, PythonMultiFisherExact +from ._stats import CountStatistic, FisherExactTest __all__ = [ - 'CountStatistic', 'ScipyFisherExact', 'PythonMultiFisherExact', + 'CountStatistic', 'FisherExactTest', ] diff --git a/src/gpsea/analysis/pcats/stats/_stats.py b/src/gpsea/analysis/pcats/stats/_stats.py index 8df07fdcd..981176ee1 100644 --- a/src/gpsea/analysis/pcats/stats/_stats.py +++ b/src/gpsea/analysis/pcats/stats/_stats.py @@ -1,5 +1,7 @@ import abc import math +import typing + from decimal import Decimal @@ -14,9 +16,45 @@ class CountStatistic(metaclass=abc.ABCMeta): `CountStatistic` calculates a p value for a contingency table produced by a pair of discrete random variables. - The `counts` table is usually `2x2` or `2x3`. + + Supports shape + ^^^^^^^^^^^^^^ + + `CountStatistic` takes the counts in form of a data frame, + and some statistics impose additional requirements on the frame shape. + For instance, GPSEA's implementation of the Fisher exact test + can compare counts in a ``(2, 2)`` or ``(2, 3)`` arrays + but χ2 test can test an ``(m, n)`` array. + + It is important to check that a genotype/phenotype predicate produces + the number of groups which the statistic can test. + + The :attr:`supports_shape` returns a sequence with requirements + on the shape of the data array/frame. The sequence includes + the number of + + Examples + ******** + + +------------------------+-------------------------+------------------+ + | Test | Array shape | `supports_shape` | + +========================+=========================+==================+ + | Fisher Exact Test | ``(2, [2, 3])`` | ``(2, [2,3])`` | + +------------------------+-------------------------+------------------+ + | χ2 | ``(*, *)`` | ``(None, None)`` | + +------------------------+-------------------------+------------------+ """ + @property + @abc.abstractmethod + def supports_shape( + self, + ) -> typing.Sequence[typing.Union[int, typing.Sequence[int], None]]: + """ + Get a sequence of the supported shapes. + """ + pass + @abc.abstractmethod def compute_pval( self, @@ -25,47 +63,35 @@ def compute_pval( pass -class ScipyFisherExact(CountStatistic): +class FisherExactTest(CountStatistic): """ - `ScipyFisherExact` performs Fisher Exact Test on a `2x2` contingency table. + `FisherExactTest` performs Fisher Exact Test on a `2x2` or `2x3` contingency table. - The class is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function. - The two-sided :math:`H_1` is considered. + The `2x2` version is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function, + while the `2x3` variant is implemented in Python. + In both variants, the two-sided :math:`H_1` is considered. """ + + def __init__(self): + self._shape = (2, (2, 3)) - def compute_pval( + @property + def supports_shape( self, - counts: pd.DataFrame, - ) -> float: - assert counts.shape == ( - 2, - 2, - ), f"Cannot run Fisher exact test on an array with {counts.shape} shape" - _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided") - return pval - - -class PythonMultiFisherExact(CountStatistic): - """ - `PythonMultiFisherExact` is a Python implementation of Fisher Exact Test to compute - p value for a `2x3` contingency table. - """ + ) -> typing.Sequence[typing.Union[int, typing.Sequence[int], None]]: + return self._shape def compute_pval( self, counts: pd.DataFrame, ) -> float: - PythonMultiFisherExact._check_input(counts) - return self._fisher_exact(counts.values) - - @staticmethod - def _check_input(a: pd.DataFrame): - if not isinstance(a, pd.DataFrame): - raise ValueError(f"Expected a pandas DataFrame but got {type(a)}") - if not a.shape == (2, 3): - raise ValueError(f"Shape of the array must be (2, 3) but got {a.shape}") - if np.array_equal(a.values, np.zeros_like(a)): - raise ValueError("Data frame is all zeros, cannot run analysis") + if counts.shape == (2, 2): + _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided") + return pval + elif counts.shape == (2, 3): + return self._fisher_exact(counts.values) + else: + raise ValueError(f'Unsupported counts shape {counts.shape}') def _fisher_exact( self, diff --git a/src/gpsea/analysis/pcats/stats/_test__stats.py b/src/gpsea/analysis/pcats/stats/_test__stats.py index aa2543063..c8db0b164 100644 --- a/src/gpsea/analysis/pcats/stats/_test__stats.py +++ b/src/gpsea/analysis/pcats/stats/_test__stats.py @@ -2,14 +2,14 @@ import pandas as pd import pytest -from ._stats import PythonMultiFisherExact +from ._stats import FisherExactTest class TestPythonMultiFisherExact: @pytest.fixture - def fisher_exact(self) -> PythonMultiFisherExact: - return PythonMultiFisherExact() + def fisher_exact(self) -> FisherExactTest: + return FisherExactTest() @pytest.mark.parametrize( "counts, expected", @@ -24,10 +24,9 @@ def test_compute_pval( self, counts, expected, - fisher_exact: PythonMultiFisherExact, + fisher_exact: FisherExactTest, ): contingency_matrix = pd.DataFrame(np.array(counts, dtype=np.int64)) final_pval = fisher_exact.compute_pval(contingency_matrix) assert final_pval == pytest.approx(expected) - diff --git a/tests/analysis/pcats/test_disease.py b/tests/analysis/pcats/test_disease.py index 48acece3e..9e16b3002 100644 --- a/tests/analysis/pcats/test_disease.py +++ b/tests/analysis/pcats/test_disease.py @@ -4,7 +4,7 @@ from gpsea.model import Cohort from gpsea.analysis.pcats import DiseaseAnalysis -from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact +from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest from gpsea.analysis.predicate.genotype import GenotypePolyPredicate from gpsea.analysis.predicate.phenotype import DiseasePresencePredicate @@ -13,7 +13,7 @@ class TestDiseaseAnalysis: @pytest.fixture(scope='class') def count_statistic(self) -> CountStatistic: - return ScipyFisherExact() + return FisherExactTest() @pytest.fixture(scope='class') def suox_disease(self) -> DiseasePresencePredicate: diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py index 516bf9fc5..690e9dd14 100644 --- a/tests/analysis/pcats/test_hpo_term_analysis.py +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -7,7 +7,7 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcFilter, HpoMtcFilter from gpsea.analysis.pcats import HpoTermAnalysis -from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact +from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest from gpsea.analysis.predicate.genotype import GenotypePolyPredicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate @@ -16,7 +16,7 @@ class TestHpoTermAnalysis: @pytest.fixture(scope='class') def count_statistic(self) -> CountStatistic: - return ScipyFisherExact() + return FisherExactTest() @pytest.fixture(scope='class') def phenotype_mtc_filter( diff --git a/tests/test_tutorial.py b/tests/test_tutorial.py index 52924eff4..b592f1f85 100644 --- a/tests/test_tutorial.py +++ b/tests/test_tutorial.py @@ -6,7 +6,7 @@ from gpsea.preprocessing import configure_caching_cohort_creator, CohortCreator, load_phenopackets from gpsea.analysis.pcats import HpoTermAnalysis from gpsea.analysis.mtc_filter import HpoMtcFilter -from gpsea.analysis.pcats.stats import ScipyFisherExact +from gpsea.analysis.pcats.stats import FisherExactTest from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest @@ -53,7 +53,7 @@ def hpo_term_analysis( mtc_filter, ) -> HpoTermAnalysis: return HpoTermAnalysis( - count_statistic=ScipyFisherExact(), + count_statistic=FisherExactTest(), mtc_filter=mtc_filter, mtc_correction='fdr_bh', mtc_alpha=0.05, From ae2e8d00166e292d2c0d53a6fc3a9b5c41db1619 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 2 Sep 2024 10:03:10 +0200 Subject: [PATCH 09/16] Add docs to `ModeOfInheritancePredicate`. --- .../predicate/genotype/_gt_predicates.py | 49 ++++++++++++++++--- .../analysis/predicate/genotype/_variant.py | 2 +- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index a032aae95..23fc4a5df 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -458,52 +458,89 @@ def __repr__(self) -> str: class ModeOfInheritancePredicate(GenotypePolyPredicate): + """ + `ModeOfInheritancePredicate` assigns an individual into a group based on compatibility with + the selected mode of inheritance. + """ @staticmethod def autosomal_dominant( variant_predicate: VariantPredicate, ) -> "ModeOfInheritancePredicate": + """ + Create a predicate that assigns the patient either + into homozygous reference or heterozygous + group in line with the autosomal dominant mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing. + """ return ModeOfInheritancePredicate.from_moi_info( variant_predicate=variant_predicate, - mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_dominant(), + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), ) @staticmethod def autosomal_recessive( variant_predicate: VariantPredicate, ) -> "ModeOfInheritancePredicate": + """ + Create a predicate that assigns the patient either into + homozygous reference, heterozygous, or biallelic alternative allele + (homozygous alternative or compound heterozygous) + group in line with the autosomal recessive mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing. + """ return ModeOfInheritancePredicate.from_moi_info( variant_predicate=variant_predicate, - mode_of_inheritance_info=ModeOfInheritanceInfo.autosomal_recessive(), + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), ) @staticmethod def x_dominant( variant_predicate: VariantPredicate, ) -> "ModeOfInheritancePredicate": + """ + Create a predicate that assigns the patient either into + homozygous reference or heterozygous + group in line with the X-linked dominant mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing. + """ return ModeOfInheritancePredicate.from_moi_info( variant_predicate=variant_predicate, - mode_of_inheritance_info=ModeOfInheritanceInfo.x_dominant(), + mode_of_inheritance_data=ModeOfInheritanceInfo.x_dominant(), ) @staticmethod def x_recessive( variant_predicate: VariantPredicate, ) -> "ModeOfInheritancePredicate": + """ + Create a predicate that assigns the patient either into + homozygous reference, heterozygous, biallelic alternative allele + (homozygous alternative or compound heterozygous), or hemizygous + group in line with the X-linked recessive mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing. + """ return ModeOfInheritancePredicate.from_moi_info( variant_predicate=variant_predicate, - mode_of_inheritance_info=ModeOfInheritanceInfo.x_recessive(), + mode_of_inheritance_data=ModeOfInheritanceInfo.x_recessive(), ) @staticmethod def from_moi_info( variant_predicate: VariantPredicate, - mode_of_inheritance_info: ModeOfInheritanceInfo, + mode_of_inheritance_data: ModeOfInheritanceInfo, ) -> "ModeOfInheritancePredicate": + """ + Create a predicate for specified mode of inheritance data. + """ allele_counter = AlleleCounter(predicate=variant_predicate) return ModeOfInheritancePredicate( allele_counter=allele_counter, - mode_of_inheritance_info=mode_of_inheritance_info, + mode_of_inheritance_info=mode_of_inheritance_data, ) def __init__( diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index aeaf6e1df..54d397352 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -313,7 +313,7 @@ def is_structural_deletion( to determine if the length of the variant is above or below `threshold`. **IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele - is shorter than the reference allele. See the method's documentation for more info. + is shorter than the reference allele. See :ref:`change-length-of-an-allele` for more info. **Example** From 304df63f25dd4aac4d66c87d63913fe7371cae97 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 2 Sep 2024 10:13:44 +0200 Subject: [PATCH 10/16] Fix compatibility checking bug. --- src/gpsea/analysis/pcats/_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 6600dab46..3448ba93b 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -331,7 +331,7 @@ def _check_compatibility( pheno_failed = [] for i, ph_predicate in enumerate(pheno_predicates): - if ph_predicate.n_categorizations not in pheno_accepted: + if ph_predicate.n_categorizations() not in pheno_accepted: pheno_failed.append(i) if len(pheno_failed) != 0: issues.append( @@ -350,7 +350,7 @@ def _check_compatibility( else: raise ValueError(f'Cannot use a count statistic that supports shape {pheno, geno}') - if gt_predicate.n_categorizations not in geno_accepted: + if gt_predicate.n_categorizations() not in geno_accepted: issues.append('Genotype predicate is incompatible with the count statistic') return issues From ade5d5c71913d83fb9721c65cec7511ef60bf21a Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 11:29:08 +0200 Subject: [PATCH 11/16] Remove the obsolete `PhenotypeMtcFilter.filter_terms_to_test` method. --- src/gpsea/analysis/mtc_filter/_impl.py | 127 ----------------- src/gpsea/analysis/pcats/__init__.py | 2 + src/gpsea/analysis/pcats/_impl.py | 2 + .../analysis/pcats/test_hpo_term_analysis.py | 56 +++++++- tests/analysis/test_analysis.py | 26 +--- tests/analysis/test_mtc_filter.py | 130 ++++++++++-------- 6 files changed, 124 insertions(+), 219 deletions(-) diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index 0aca8cc7d..82a044c13 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -90,36 +90,6 @@ def filter( """ pass - @abc.abstractmethod - def filter_terms_to_test( - self, - gt_predicate: GenotypePolyPredicate, - n_usable: typing.Mapping[hpotk.TermId, int], - all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], - ) -> typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - typing.Mapping[str, int], - ]: - """ - Decide which terms to pass through for statistical testing. - The intention of this class is to reduce multiple testing burden by removing terms that are unlikely - to lead to interesting statistical/analytical results. - - Args: - gt_predicate: the predicate used to bin patients into groups along the genotype axis - n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients - that could be binned according to the used genotype/phenotype predicate. - all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients - in the i-th phenotype (rows) and j-th genotype (column) group - Returns: - a tuple with three items: - - a mapping from :class:`hpotk.TermId` -> - - a mapping from :class:`hpotk.TermId` -> - - a mapping from a `str` with reason why a term was filtered out (e.g. *Skipping top level term*) - """ - pass - @abc.abstractmethod def filter_method_name(self) -> str: """ @@ -453,103 +423,6 @@ def filter( return tuple(results) - def filter_terms_to_test( - self, - gt_predicate: GenotypePolyPredicate, - n_usable: typing.Mapping[hpotk.TermId, int], - all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], - ) -> typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - typing.Mapping[str, int], - ]: - """ - Args: - gt_predicate: the predicate used to bin patients into groups along the genotype axis - n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients - that could be binned according to the used genotype/phenotype predicate. - all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients - in the i-th phenotype (rows) and j-th genotype (column) group - """ - filtered_n_usable = {} - filtered_all_counts = {} - reason_for_filtering_out = defaultdict(int) - tested_counts_pf = defaultdict( - pd.DataFrame - ) # key is an HP id, value is a tuple with counts, i.e., - - # iterate through the terms in the sorted order, starting from the leaves of the induced graph. - categories = tuple(gt_predicate.get_categories()) - for term_id in self._get_ordered_terms(all_counts.keys()): - if term_id in self._general_hpo_terms: - reason_for_filtering_out["Skipping general term"] += 1 - continue - - if not self._hpo.graph.is_ancestor_of( - hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id - ): - reason_for_filtering_out["Skipping non phenotype term"] += 1 - continue - - # get total number of observations - if term_id not in all_counts: - reason_for_filtering_out["Skipping non-target term"] += 1 - continue - - counts_frame = all_counts[term_id] - total = counts_frame.sum().sum() - max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( - counts_frame - ) - if max_freq < self._hpo_term_frequency_filter: - reason = ( - "Skipping term with maximum frequency " - f"that was less than threshold {self._hpo_term_frequency_filter}" - ) - reason_for_filtering_out[reason] += 1 - continue - - if counts_frame.shape == (2, 2) and total < 7: - reason = f"Skipping term with only {total} observations (not powered for 2x2)" - reason_for_filtering_out[reason] += 1 - continue - - # todo -- similar for (3,2) - if not HpoMtcFilter.some_cell_has_greater_than_one_count(counts_frame): - reason = "Skipping term because no genotype has more than one observed HPO count" - reason_for_filtering_out[reason] += 1 - continue - - elif HpoMtcFilter.genotypes_have_same_hpo_proportions( - counts_frame, - categories, - ): - reason = "Skipping term because all genotypes have same HPO observed proportions" - reason_for_filtering_out[reason] += 1 - continue - - elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( - counts_frame, - categories, - ): - reason = "Skipping term because one genotype had zero observations" - reason_for_filtering_out[reason] += 1 - continue - - for child_term_id in self._hpo.graph.get_children(term_id): - if child_term_id in tested_counts_pf: - if tested_counts_pf[child_term_id].equals(counts_frame): - # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches? - reason = "Child term with same counts previously tested" - reason_for_filtering_out[reason] += 1 - continue - - # if we get here, then we include the test for `term_id` - filtered_n_usable[term_id] = n_usable[term_id] - filtered_all_counts[term_id] = all_counts[term_id] - - return filtered_n_usable, filtered_all_counts, reason_for_filtering_out - def filter_method_name(self) -> str: return "HPO MTC filter" diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py index 6a82d92f1..59eceb977 100644 --- a/src/gpsea/analysis/pcats/__init__.py +++ b/src/gpsea/analysis/pcats/__init__.py @@ -21,9 +21,11 @@ from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult from ._impl import DiseaseAnalysis from ._impl import HpoTermAnalysis, HpoTermAnalysisResult +from ._impl import apply_predicates_on_patients __all__ = [ 'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult', 'DiseaseAnalysis', 'HpoTermAnalysis', 'HpoTermAnalysisResult', + 'apply_predicates_on_patients', ] diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 3448ba93b..43d8c7077 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -606,6 +606,8 @@ def _compute_result( counts=all_counts, ) mtc_mask = np.array([r.is_passed() for r in mtc_filter_results]) + if not mtc_mask.any(): + raise ValueError("No phenotypes are left for the analysis after MTC filtering step") # 3 - Compute nominal p values pvals = np.full(shape=(len(n_usable),), fill_value=np.nan) diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py index 690e9dd14..dbd8d508f 100644 --- a/tests/analysis/pcats/test_hpo_term_analysis.py +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -14,18 +14,18 @@ class TestHpoTermAnalysis: - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def count_statistic(self) -> CountStatistic: return FisherExactTest() - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def phenotype_mtc_filter( self, hpo: hpotk.MinimalOntology, ) -> PhenotypeMtcFilter: return HpoMtcFilter.default_filter( hpo=hpo, - term_frequency_threshold=.2, + term_frequency_threshold=0.2, ) @pytest.fixture @@ -37,8 +37,8 @@ def analysis( return HpoTermAnalysis( count_statistic=count_statistic, mtc_filter=phenotype_mtc_filter, - mtc_correction='fdr_bh', - mtc_alpha=.05, + mtc_correction="fdr_bh", + mtc_alpha=0.05, ) def test_compare_genotype_vs_phenotypes( @@ -46,7 +46,7 @@ def test_compare_genotype_vs_phenotypes( analysis: HpoTermAnalysis, suox_cohort: Cohort, suox_gt_predicate: GenotypePolyPredicate, - suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]] + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], ): results = analysis.compare_genotype_vs_phenotypes( cohort=suox_cohort.all_patients, @@ -55,4 +55,46 @@ def test_compare_genotype_vs_phenotypes( ) assert results is not None - # TODO: improve testing + + assert results.total_tests == 4 + assert results.n_usable == (35, 18, 13, 25, 23) + assert results.pvals == pytest.approx( + [ + 0.0721291631224236, + 1.0, + float("nan"), + 0.35921595820909313, + 0.6668461434917216, + ], + nan_ok=True, + ) + assert results.corrected_pvals is not None + assert results.corrected_pvals == pytest.approx( + [ + 0.2885166524896944, + 1.0, + float("nan"), + 0.7184319164181863, + 0.8891281913222954, + ], + nan_ok=True, + ) + + def test_compare_genotype_vs_phenotypes_explodes_if_no_phenotypes_are_left_after_mtc_filter( + self, + analysis: HpoTermAnalysis, + degenerated_cohort: Cohort, + suox_gt_predicate: GenotypePolyPredicate, + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + ): + with pytest.raises(ValueError) as e: + analysis.compare_genotype_vs_phenotypes( + cohort=degenerated_cohort, + gt_predicate=suox_gt_predicate, + pheno_predicates=suox_pheno_predicates, + ) + + assert ( + e.value.args[0] + == "No phenotypes are left for the analysis after MTC filtering step" + ) diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py index 5db246218..ea5408e28 100644 --- a/tests/analysis/test_analysis.py +++ b/tests/analysis/test_analysis.py @@ -8,10 +8,11 @@ from gpsea.analysis import ( apply_predicates_on_patients, - CohortAnalysis, CohortAnalysisConfiguration, + CohortAnalysis, configure_cohort_analysis, ) +from gpsea.analysis.mtc_filter._impl import HpoMtcFilter from gpsea.model import * from gpsea.model.genome import * from gpsea.analysis.predicate import PatientCategories @@ -69,26 +70,3 @@ def test_analysis_passes_if_variant_predicate_always_returns_false( predicate=always_false_variant_predicate ) assert results is not None - - def test_analysis_explodes_if_no_phenotypes_are_left_for_analysis( - self, - hpo: hpotk.MinimalOntology, - degenerated_cohort: Cohort, - tmp_path: pathlib.Path, - always_false_variant_predicate: VariantPredicate, - ): - config = CohortAnalysisConfiguration() - config.hpo_mtc_strategy() - cohort_analysis = configure_cohort_analysis( - hpo=hpo, - cohort=degenerated_cohort, - cache_dir=str(tmp_path), - config=config, - ) - - with pytest.raises(ValueError) as e: - _ = cohort_analysis.compare_hpo_vs_genotype( - predicate=always_false_variant_predicate, - ) - - assert e.value.args[0] == 'No phenotypes are left for the analysis after MTC filtering step' diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index c07600ef2..af6f14392 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -1,3 +1,4 @@ +from itertools import count import typing import hpotk @@ -5,11 +6,11 @@ import pandas as pd import pytest -from gpsea.analysis import apply_predicates_on_patients from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate +from gpsea.analysis.pcats import apply_predicates_on_patients from gpsea.model import Cohort @@ -31,16 +32,13 @@ def patient_counts( suox_cohort: Cohort, suox_gt_predicate: GenotypePolyPredicate, suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], - ) -> typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - ]: - categories, n_usable, all_counts = apply_predicates_on_patients( + ) -> typing.Sequence[pd.DataFrame]: + _, counts = apply_predicates_on_patients( patients=suox_cohort.all_patients, gt_predicate=suox_gt_predicate, pheno_predicates=suox_pheno_predicates, ) - return n_usable, all_counts + return counts @pytest.fixture(scope='class') def gt_categories(self) -> pd.Index: @@ -135,74 +133,80 @@ def test_genotypes_have_same_hpo_proportions( assert actual == expected def test_filter_terms_to_test( - self, - mtc_filter: HpoMtcFilter, - suox_gt_predicate: GenotypePolyPredicate, - patient_counts: typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - ], + self, + mtc_filter: HpoMtcFilter, + suox_gt_predicate: GenotypePolyPredicate, + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + patient_counts: typing.Sequence[pd.DataFrame], ): - n_usable, all_counts = patient_counts - mtc_report = mtc_filter.filter_terms_to_test( - suox_gt_predicate, - n_usable, - all_counts, + phenotypes = [p.phenotype for p in suox_pheno_predicates] + + mtc_report = mtc_filter.filter( + gt_predicate=suox_gt_predicate, + phenotypes=phenotypes, + counts=patient_counts, ) - assert isinstance(mtc_report, tuple) - assert len(mtc_report) == 3 - - filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report + assert isinstance(mtc_report, typing.Sequence) + assert len(mtc_report) == 5 - assert reason_for_filtering_out['Skipping term because all genotypes have same HPO observed proportions'] == 1 - assert reason_for_filtering_out['Skipping general term'] == 16 - assert reason_for_filtering_out['Skipping non-target term'] == 5 - assert reason_for_filtering_out['Skipping top level term'] == 0 + is_ok = [r.is_passed() for r in mtc_report] + assert is_ok == [True, True, False, True, True] - assert len(filtered_n_usable) == 4 - assert len(filtered_all_counts) == 4 + reasons = [r.reason for r in mtc_report] + assert reasons == [ + None, None, + 'Skipping term because all genotypes have same HPO observed proportions', + None, None, + ] def test_specified_term_mtc_filter( - self, - suox_gt_predicate: GenotypePolyPredicate, - patient_counts: typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - ], + self, + suox_gt_predicate: GenotypePolyPredicate, + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + patient_counts: typing.Sequence[pd.DataFrame], ): """ The point of this test is to check that if we filter to test only one term ("HP:0032350"), then this is the only term that should survive the filter. We start with a total of five terms (n_usable==5), - but after our filter, only one survives (filtered_n_usable == 1), and we have four cases in which the - reason for filtering out is 'Skipping non-specified term' + but after our filter, only one survives, and we have four cases in which the + reason for filtering out is 'Non-specified term' """ - specified_filter = SpecifiedTermsMtcFilter(terms_to_test={hpotk.TermId.from_curie("HP:0032350")}) - n_usable, all_counts = patient_counts - mtc_report = specified_filter.filter_terms_to_test( - suox_gt_predicate, - n_usable, - all_counts, + specified_filter = SpecifiedTermsMtcFilter( + terms_to_test=(hpotk.TermId.from_curie("HP:0032350"),), + ) + phenotypes = [p.phenotype for p in suox_pheno_predicates] + + mtc_report = specified_filter.filter( + gt_predicate=suox_gt_predicate, + phenotypes=phenotypes, + counts=patient_counts, ) - assert isinstance(mtc_report, tuple) - assert len(mtc_report) == 3 # # Skipping non-specified term (n=5) + assert isinstance(mtc_report, typing.Sequence) + assert len(mtc_report) == 5 + + is_passed = [r.is_passed() for r in mtc_report] + assert is_passed == [ + False, False, True, False, False, + ] - filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report - assert len(n_usable) == 5 - assert len(filtered_n_usable) == 1 - assert reason_for_filtering_out['Skipping non-specified term'] == 4 + reasons = [r.reason for r in mtc_report] + assert reasons == [ + 'Non-specified term', + 'Non-specified term', + None, + 'Non-specified term', + 'Non-specified term', + ] def test_min_observed_HPO_threshold( self, - patient_counts: typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - ], + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + patient_counts: typing.Sequence[pd.DataFrame], ): """ In our heuristic filter, we only test terms that have at least a threshold - frequency in at least one of the groups. We use the "all counts" datastructure, that - is a dictionary whose keys are hpotoolkit TermIds and whose values are pandas DataFrames + frequency in at least one of the groups. We use the `patient_counts` - a sequence of DataFrames with 2x2 contingenicy tables of counts. For instance, each column will have one row for PatientCategories.YES and one for PatientCategories.NO, indicating counts of measured observed/excluded HPO phenotypes. Each column is a certain genotype, e.g., MISSENSE or NON-MISSENSE. We want the @@ -211,28 +215,32 @@ def test_min_observed_HPO_threshold( for all of the HPO terms in the dictionary. """ EPSILON = 0.001 - _, all_counts = patient_counts + curie2idx = { + p.phenotype.value: i + for i, p + in enumerate(suox_pheno_predicates) + } # Ectopia lentis HP:0001083 (6 9 1 2), freqs are 6/15=0.4 and 1/3=0.33 - ectopia = all_counts[hpotk.TermId.from_curie("HP:0001083")] + ectopia = patient_counts[curie2idx["HP:0001083"]] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ectopia) assert max_f == pytest.approx(0.4, abs=EPSILON) # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1 - seizure = all_counts[hpotk.TermId.from_curie("HP:0001250")] + seizure = patient_counts[curie2idx["HP:0001250"]] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(seizure) assert max_f == pytest.approx(1.0, abs=EPSILON) # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1 - sulfocysteinuria = all_counts[hpotk.TermId.from_curie("HP:0032350")] + sulfocysteinuria = patient_counts[curie2idx["HP:0032350"]] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(sulfocysteinuria) assert max_f == pytest.approx(1.0, abs=EPSILON) # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5 - ndelay = all_counts[hpotk.TermId.from_curie("HP:0012758")] + ndelay = patient_counts[curie2idx["HP:0012758"]] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ndelay) assert max_f == pytest.approx(0.5, abs=EPSILON) # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714 - hypertonia = all_counts[hpotk.TermId.from_curie("HP:0001276")] + hypertonia = patient_counts[curie2idx["HP:0001276"]] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(hypertonia) assert max_f == pytest.approx(0.5714, abs=EPSILON) From ab0a116e13baa937ec3a6f57a928cdfd3fa632f3 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 11:56:20 +0200 Subject: [PATCH 12/16] Add `PhenotypePolyPredicate.present_phenotype_categorization` property. --- .../analysis/predicate/phenotype/_pheno.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index c2646f727..9a9c496c8 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -60,11 +60,12 @@ class PhenotypePolyPredicate( Each phenotype category `P` can be a :class:`~hpotk.model.TermId` representing an HPO term or an OMIM/MONDO term. - Most of the time, only one category is investigated, and :attr:`phenotype` will return a sequence with - one element only (e.g. *Arachnodactyly* `HP:0001166`). However, multiple categories can be sought as well, - such as when the predicate bins the patient into one of discrete diseases - (e.g. Geleophysic dysplasia, Marfan syndrome, ...). Then the predicate should return a sequence of disease - identifiers. + Only one category can be investigated, and :attr:`phenotype` returns the investigated phenotype + (e.g. *Arachnodactyly* `HP:0001166`). + + As another hallmark of this predicate, one of the categorizations must correspond to the group of patients + who exibit the investigated phenotype. The categorization is provided + via :attr:`present_phenotype_categorization` property. """ @property @@ -75,6 +76,14 @@ def phenotype(self) -> P: """ pass + @property + @abc.abstractmethod + def present_phenotype_categorization(self) -> PhenotypeCategorization[P]: + """ + Get the categorization which represents the group of the patients who exibit the investigated phenotype. + """ + pass + class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]): """ @@ -112,6 +121,7 @@ def __init__( category=PatientCategories.NO, phenotype=self._query, ) + self._categorizations = (self._phenotype_observed, self._phenotype_excluded) def get_question(self) -> str: return f"Is {self._query_label} present in the patient?" @@ -120,10 +130,14 @@ def get_question(self) -> str: def phenotype(self) -> hpotk.TermId: return self._query + @property + def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]: + return self._phenotype_observed + def get_categorizations( self, ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]: - return self._phenotype_observed, self._phenotype_excluded + return self._categorizations def test( self, patient: Patient @@ -207,6 +221,10 @@ def get_question(self) -> str: def phenotype(self) -> hpotk.TermId: return self._query + @property + def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]: + return self._diagnosis_present + def get_categorizations( self, ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]: From b35790b8a6794428b2eef0d4fe3d830e6e6b8650 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 16:07:11 +0200 Subject: [PATCH 13/16] Improve `PhenotypeMtcFilter` tests. --- docs/report/tbx5_frameshift_vs_missense.csv | 36 ++-- ...bx5_frameshift_vs_missense.mtc_report.html | 21 +- docs/tutorial.rst | 8 +- src/gpsea/analysis/mtc_filter/_impl.py | 202 ++++++------------ src/gpsea/analysis/pcats/_impl.py | 2 +- .../analysis/predicate/phenotype/_pheno.py | 8 + tests/analysis/test_analysis.py | 72 ------- tests/analysis/test_examples.py | 2 + tests/analysis/test_mtc_filter.py | 151 +++++++++---- 9 files changed, 219 insertions(+), 283 deletions(-) delete mode 100644 tests/analysis/test_analysis.py diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 07245a440..0bc1e5c6c 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,22 +1,22 @@ "Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 -Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 -Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015 -Heart block [HP:0012722],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015 -Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.018011232501600187,0.005628510156750059 -Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.03598740440845704,0.01349527665317139 -Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.058517997576307476,0.02560162393963452 -Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.14881278039172777,0.07440639019586388 -Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2532484592139712,0.1424522583078588 -Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2699930339748754,0.1687456462342971 -Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6233766233766234,0.42857142857142855 -Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.7619047619047618,0.5714285714285713 -Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.7735491022101784 -Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 +Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 +Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 +Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 +Heart block [HP:0012722],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 +Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.0191369345329502,0.005628510156750059 +Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.038236617183985605,0.01349527665317139 +Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.062175372424826694,0.02560162393963452 +Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.15811357916621074,0.07440639019586388 +Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2690764879148444,0.1424522583078588 +Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2868675985983051,0.1687456462342971 +Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 +Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 +Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 +Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,, -Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,, -Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,, -Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,, +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 +Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, +Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, +Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html index 5b149af79..b6ac17472 100644 --- a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html +++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 16 out of the total of 260 HPO terms.

+

Performed statistical tests for 17 out of the total of 260 HPO terms.

- + @@ -61,21 +61,21 @@

Phenotype testing report

- - + + - - + + - + @@ -114,13 +114,6 @@

Phenotype testing report

- - - - - - - diff --git a/docs/tutorial.rst b/docs/tutorial.rst index e047943ce..29d8f2d9a 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -246,15 +246,15 @@ Now we can perform the analysis and investigate the results. ... pheno_predicates=pheno_predicates, ... ) >>> result.total_tests -16 +17 -We only tested 16 HPO terms. This is despite the individuals being collectively annotated with +We only tested 1y HPO terms. This is despite the individuals being collectively annotated with 260 direct and indirect HPO terms >>> len(result.phenotypes) 260 -We can show the reasoning behind *not* testing 244 (`260 - 16`) HPO terms +We can show the reasoning behind *not* testing 243 (`260 - 17`) HPO terms by exploring the phenotype MTC filtering report. >>> from gpsea.view import MtcStatsViewer @@ -283,4 +283,4 @@ was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. Fisher exact test computed a p value of `~0.0000562` and the p value corrected by Benjamini-Hochberg procedure -is `~0.000899`. +is `~0.000955`. diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index 82a044c13..c52d10b9c 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -1,14 +1,15 @@ import abc import typing -from collections import defaultdict, deque +from collections import deque import hpotk +from matplotlib import axis +import numpy as np import pandas as pd -from ..predicate import PatientCategories, PatientCategory from ..predicate.genotype import GenotypePolyPredicate -from ..predicate.phenotype import P +from ..predicate.phenotype import PhenotypePolyPredicate, P class PhenotypeMtcResult: @@ -76,14 +77,14 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta): def filter( self, gt_predicate: GenotypePolyPredicate, - phenotypes: typing.Sequence[P], + ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: """ Test if the phenotype with given counts should be included in the downstream analysis. :param gt_predicate: the predicate that produced the columns of the `count` data frame. - :param phenotypes: the tested phenotypes. + :param ph_predicates: the phenotype predicates that produced the rows of the `counts` data frames. :param counts: a sequence of 2D data frames for the tested phenotypes. Each data frame corrresponds to a genotype/phenotype contingency matrix. :returns: a sequence of filter results for the input `phenotypes`. @@ -111,27 +112,11 @@ def __init__(self): def filter( self, gt_predicate: GenotypePolyPredicate, - phenotypes: typing.Sequence[P], + ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: # Always OK! 😏 - return tuple(self._ok for _ in phenotypes) - - def filter_terms_to_test( - self, - gt_predicate: GenotypePolyPredicate, - n_usable: typing.Mapping[hpotk.TermId, int], - all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], - ) -> typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - typing.Mapping[str, int], - ]: - """ - Use this implementation to test all available HPO terms. - No HPO terms will be filtered out! - """ - return n_usable, all_counts, {} + return tuple(self._ok for _ in ph_predicates) def filter_method_name(self) -> str: return "All HPO terms" @@ -164,55 +149,17 @@ def __init__( def filter( self, gt_predicate: GenotypePolyPredicate, - phenotypes: typing.Sequence[P], + ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: results = [] - for phenotype in phenotypes: - if phenotype in self._terms_to_test_set: + for predicate in ph_predicates: + if predicate.phenotype in self._terms_to_test_set: results.append(self._ok) else: results.append(self._fail) return tuple(results) - def filter_terms_to_test( - self, - gt_predicate: GenotypePolyPredicate, - n_usable: typing.Mapping[hpotk.TermId, int], - all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], - ) -> typing.Tuple[ - typing.Mapping[hpotk.TermId, int], - typing.Mapping[hpotk.TermId, pd.DataFrame], - typing.Mapping[str, int], - ]: - """ - Remove terms that are not members of the specific set of HPO terms to be tested. - - Args: - gt_predicate: the predicate used to bin patients into groups along the genotype axis - n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients - that could be binned according to the used genotype/phenotype predicate. - all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients - in the i-th phenotype (rows) and j-th genotype (column) group - - Returns: - filtered versions of the two dictionaries above and dataframe with reasons for skipping - """ - filtered_n_usable = {} - filtered_all_counts = {} - reason_for_filtering_out: typing.DefaultDict[str, int] = defaultdict(int) - - for term_id in all_counts.keys(): - if term_id not in self._terms_to_test_set: - reason_for_filtering_out["Skipping non-specified term"] += 1 - continue - - # if we get here, then the term is a member of our list of terms to be tested. - filtered_n_usable[term_id] = n_usable[term_id] - filtered_all_counts[term_id] = all_counts[term_id] - - return filtered_n_usable, filtered_all_counts, reason_for_filtering_out - def filter_method_name(self) -> str: return "Specified terms MTC filter" @@ -326,13 +273,13 @@ def __init__( def filter( self, gt_predicate: GenotypePolyPredicate, - phenotypes: typing.Sequence[hpotk.TermId], + ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: + phenotypes = [p.phenotype for p in ph_predicates] p_to_idx = {p: i for i, p in enumerate(phenotypes)} - results = [None for _ in range(len(phenotypes))] - categories = tuple(gt_predicate.get_categories()) + results: typing.MutableSequence[typing.Optional[PhenotypeMtcResult]] = [None for _ in range(len(phenotypes))] for term_id in self._get_ordered_terms(phenotypes): try: idx = p_to_idx[term_id] @@ -357,10 +304,12 @@ def filter( # reason_for_filtering_out["Skipping non-target term"] += 1 # continue + ph_predicate = ph_predicates[idx] contingency_matrix = counts[idx] total = contingency_matrix.sum().sum() max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( - contingency_matrix + contingency_matrix, + ph_predicate=ph_predicate, ) if max_freq < self._hpo_term_frequency_filter: reason = ( @@ -375,23 +324,26 @@ def filter( results[idx] = PhenotypeMtcResult.fail(reason) continue - # todo -- similar for (3,2) - if not HpoMtcFilter.some_cell_has_greater_than_one_count(contingency_matrix): + if not HpoMtcFilter.some_cell_has_greater_than_one_count( + counts=contingency_matrix, + ph_predicate=ph_predicate, + ): reason = "Skipping term because no genotype has more than one observed HPO count" results[idx] = PhenotypeMtcResult.fail(reason) continue - + elif HpoMtcFilter.genotypes_have_same_hpo_proportions( contingency_matrix, - categories, + gt_predicate=gt_predicate, + ph_predicate=ph_predicate, ): reason = "Skipping term because all genotypes have same HPO observed proportions" results[idx] = PhenotypeMtcResult.fail(reason) continue elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( - contingency_matrix, - categories, + counts=contingency_matrix, + gt_predicate=gt_predicate, ): reason = "Skipping term because one genotype had zero observations" results[idx] = PhenotypeMtcResult.fail(reason) @@ -417,78 +369,70 @@ def filter( term_name = self._hpo.get_term_name(phenotypes[i]) missing.append(term_name) - print('BLAAAAAA') msg = 'Missing results for {}'.format(', '.join(missing)) raise ValueError(msg) - return tuple(results) + # Ignoring the type hint, because we checked the type match above. + return tuple(results) # type: ignore def filter_method_name(self) -> str: return "HPO MTC filter" @staticmethod - def get_number_of_observed_hpo_observations(counts_frame: pd.DataFrame) -> int: - return counts_frame.loc[PatientCategories.YES].sum() + def get_number_of_observed_hpo_observations( + counts_frame: pd.DataFrame, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ) -> int: + return counts_frame.loc[ph_predicate.present_phenotype_category].sum() @staticmethod - def get_maximum_group_observed_HPO_frequency(counts_frame: pd.DataFrame) -> float: + def get_maximum_group_observed_HPO_frequency( + counts_frame: pd.DataFrame, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ) -> float: """ Returns: The maximum frequency of observed HPO annotations across all genotypes. """ - df = counts_frame.loc[PatientCategories.YES] / ( - counts_frame.loc[PatientCategories.YES] - + counts_frame.loc[PatientCategories.NO] - ) - return df.max() + all_hpo_count_per_gt = counts_frame.sum() + if (all_hpo_count_per_gt == 0).all(): + # Prevent division by zeros + return 0. + + present_hpo_count_per_gt = counts_frame.loc[ph_predicate.present_phenotype_category] + return (present_hpo_count_per_gt / all_hpo_count_per_gt).max() @staticmethod def one_genotype_has_zero_hpo_observations( counts: pd.DataFrame, - gt_categories: typing.Sequence[PatientCategory], + gt_predicate: GenotypePolyPredicate, ): - if not isinstance(counts, pd.DataFrame): - raise ValueError( - f"argument 'counts' must be pandas DataFrame but was {type(counts)}" - ) - - if counts.shape == (2, 2): - assert len(gt_categories) == 2, \ - f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!" - a, b = gt_categories - return ( - counts.loc[:, a].sum() == 0 or counts.loc[:, b].sum() == 0 - ) - elif counts.shape == (2, 3): - raise ValueError("(2,3) not yet implemented") - else: - raise ValueError( - f"Did not recognize shape of counts matrix: {counts.shape}" - ) + return any(counts.loc[:, c].sum() == 0 for c in gt_predicate.get_categories()) @staticmethod - def some_cell_has_greater_than_one_count(counts: pd.DataFrame) -> bool: + def some_cell_has_greater_than_one_count( + counts: pd.DataFrame, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ) -> bool: """ If no genotype has more than one HPO count, we do not want to do a test. For instance, if MISSENSE has one observed HPO and N excluded, and NOT MISSENSE has zero or one observed HPO, then we will skip the test + Args: counts: pandas DataFrame with counts + ph_predicate: the phenotype predicate that produced the counts Returns: true if at least one of the genotypes has more than one observed HPO count """ - if not isinstance(counts, pd.DataFrame): - raise ValueError( - f"argument 'counts' must be pandas DataFrame but was {type(counts)}" - ) - - return (counts.loc[PatientCategories.YES] > 1).any() + return (counts.loc[ph_predicate.present_phenotype_category] > 1).any() @staticmethod def genotypes_have_same_hpo_proportions( counts: pd.DataFrame, - gt_categories: typing.Sequence[PatientCategory], - delta: float = 0.01, + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + delta: float = 5e-4, ) -> bool: """ If each genotype has the same proportion of observed HPOs, then we do not want to do a test. @@ -500,28 +444,20 @@ def genotypes_have_same_hpo_proportions( Returns: true if the genotypes differ by more than `delta` """ - if not isinstance(counts, pd.DataFrame): - raise ValueError( - f"argument 'counts' must be pandas DataFrame but was {type(counts)}" - ) + numerators = np.array([ + counts.loc[ph_predicate.present_phenotype_category, c] for c in gt_predicate.get_categories() + ]) + denominators = np.array([ + counts.loc[:, c].sum() for c in gt_predicate.get_categories() + ]) + + if np.any(denominators == 0): + return False - if counts.shape == (2, 2): - assert len(gt_categories) == 2, \ - f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!" - a, b = gt_categories - num1 = counts.loc[PatientCategories.YES, a] - denom1 = counts.loc[:, a].sum() - num2 = counts.loc[PatientCategories.YES, b] - denom2 = counts.loc[:, b].sum() - if denom1 == 0 or denom2 == 0: - return False - return abs(num1 / denom1 - num2 / denom2) < delta - elif counts.shape == (2, 3): - raise ValueError("(2,3) not implemented yet") - else: - raise ValueError( - f"Did not recognize shape of counts matrix: {counts.shape}" - ) + ratios = numerators / denominators + mean_ratio = np.mean(ratios) + abs_diff = np.abs(ratios - mean_ratio) + return bool(np.all(abs_diff < delta)) def _get_ordered_terms( self, diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 43d8c7077..408a9fd9c 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -602,7 +602,7 @@ def _compute_result( # 2 - Apply MTC filter and select p values to MTC mtc_filter_results = self._mtc_filter.filter( gt_predicate=gt_predicate, - phenotypes=phenotypes, + ph_predicates=pheno_predicates, counts=all_counts, ) mtc_mask = np.array([r.is_passed() for r in mtc_filter_results]) diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 9a9c496c8..ecfb14018 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -84,6 +84,13 @@ def present_phenotype_categorization(self) -> PhenotypeCategorization[P]: """ pass + @property + def present_phenotype_category(self) -> PatientCategory: + """ + Get the patient category that correspond to the group of the patients who exibit the investigated phenotype. + """ + return self.present_phenotype_categorization.category + class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]): """ @@ -121,6 +128,7 @@ def __init__( category=PatientCategories.NO, phenotype=self._query, ) + # Some tests depend on the order of `self._categorizations`. self._categorizations = (self._phenotype_observed, self._phenotype_excluded) def get_question(self) -> str: diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py deleted file mode 100644 index ea5408e28..000000000 --- a/tests/analysis/test_analysis.py +++ /dev/null @@ -1,72 +0,0 @@ -import pathlib -import typing - -import hpotk -import numpy as np -import pandas as pd -import pytest - -from gpsea.analysis import ( - apply_predicates_on_patients, - CohortAnalysis, - configure_cohort_analysis, -) - -from gpsea.analysis.mtc_filter._impl import HpoMtcFilter -from gpsea.model import * -from gpsea.model.genome import * -from gpsea.analysis.predicate import PatientCategories -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate - - -def test_apply_predicates_on_patients( - suox_cohort: Cohort, - suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], - suox_gt_predicate: GenotypePolyPredicate, -): - categories, n_usable, counts = apply_predicates_on_patients( - patients=suox_cohort.all_patients, - pheno_predicates=suox_pheno_predicates, - gt_predicate=suox_gt_predicate, - ) - - assert isinstance(categories, typing.Collection) - assert PatientCategories.YES in categories - assert PatientCategories.NO in categories - assert len(categories) == 2 - - assert isinstance(n_usable, pd.Series) - assert len(n_usable) == 5 - - assert isinstance(counts, typing.Mapping) - assert len(counts) == 5 - - seizure_counts = counts[hpotk.TermId.from_curie('HP:0001250')] - assert np.array_equal(seizure_counts.to_numpy(), np.array([[17, 11], [7, 0]])) - - -class TestCohortAnalysis: - - @pytest.fixture - def cohort_analysis( - self, - suox_cohort: Cohort, - hpo: hpotk.MinimalOntology, - tmp_path: pathlib.Path, - ) -> CohortAnalysis: - return configure_cohort_analysis( - cohort=suox_cohort, - hpo=hpo, - cache_dir=str(tmp_path), - ) - - def test_analysis_passes_if_variant_predicate_always_returns_false( - self, - cohort_analysis: CohortAnalysis, - always_false_variant_predicate: VariantPredicate, - ): - results = cohort_analysis.compare_hpo_vs_genotype( - predicate=always_false_variant_predicate - ) - assert results is not None diff --git a/tests/analysis/test_examples.py b/tests/analysis/test_examples.py index 536f9907c..8f17161d6 100644 --- a/tests/analysis/test_examples.py +++ b/tests/analysis/test_examples.py @@ -11,6 +11,8 @@ from gpsea.model import Cohort, VariantEffect +# TODO: remove at some point! +@pytest.mark.skip('Obsolete tests') class TestCohortAnalysis: def test_compare_by_variant_effect( diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index af6f14392..d904cfe43 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -7,9 +7,8 @@ import pytest from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter -from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate from gpsea.analysis.pcats import apply_predicates_on_patients from gpsea.model import Cohort @@ -41,20 +40,36 @@ def patient_counts( return counts @pytest.fixture(scope='class') - def gt_categories(self) -> pd.Index: - return pd.Index([PatientCategories.YES, PatientCategories.NO]) + def gt_predicate( + self, + suox_gt_predicate: GenotypePolyPredicate, + ) -> GenotypePolyPredicate: + return suox_gt_predicate @pytest.fixture(scope='class') - def pheno_categories(self) -> pd.Index: - return pd.Index([PatientCategories.YES, PatientCategories.NO]) + def ph_predicate( + self, + hpo: hpotk.MinimalOntology, + ) -> PhenotypePolyPredicate[hpotk.TermId]: + """ + For the purpose of testing counts, let's pretend the counts + were created by this predicate. + """ + return PropagatingPhenotypePredicate( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0001250"), # Seizure + missing_implies_phenotype_excluded=False, + ) @staticmethod def prepare_counts_df( counts, - index: pd.Index, - columns: pd.Index, + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): values = np.array(counts).reshape((2, 2)) + index = pd.Index(gt_predicate.get_categories()) + columns = pd.Index(ph_predicate.get_categories()) return pd.DataFrame(data=values, index=index, columns=columns) @pytest.mark.parametrize( @@ -67,17 +82,17 @@ def prepare_counts_df( ] ) def test_one_genotype_has_zero_hpo_observations( - self, - counts: typing.Tuple[int], - expected: bool, - gt_categories: pd.Index, - pheno_categories: pd.Index, + self, + counts: typing.Tuple[int], + expected: bool, + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): - counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories) + counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate) actual = HpoMtcFilter.one_genotype_has_zero_hpo_observations( counts=counts_df, - gt_categories=gt_categories, + gt_predicate=gt_predicate, ) assert actual == expected @@ -99,12 +114,15 @@ def test_some_cell_has_greater_than_one_count( self, counts: typing.Tuple[int], expected: bool, - gt_categories: pd.Index, - pheno_categories: pd.Index, + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): - counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories) + counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate) - actual = HpoMtcFilter.some_cell_has_greater_than_one_count(counts=counts_df) + actual = HpoMtcFilter.some_cell_has_greater_than_one_count( + counts=counts_df, + ph_predicate=ph_predicate, + ) assert actual == expected @@ -112,26 +130,55 @@ def test_some_cell_has_greater_than_one_count( "counts, expected", [ ((10, 100, 50, 500), True), + ((0, 0, 100, 100), True), + ((10, 100, 50, 500), True), + ((10, 100, 10, 105), False), ((95, 60, 144 - 95, 71 - 60), False), ((40, 15, 18, 15), False), ] ) def test_genotypes_have_same_hpo_proportions( - self, - counts: typing.Tuple[int], - expected: bool, - gt_categories: pd.Index, - pheno_categories: pd.Index, + self, + counts: typing.Tuple[int], + expected: bool, + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): - counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories) + counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate) actual = HpoMtcFilter.genotypes_have_same_hpo_proportions( counts=counts_df, - gt_categories=gt_categories, + gt_predicate=gt_predicate, + ph_predicate=ph_predicate, ) assert actual == expected + @pytest.mark.parametrize( + "counts, expected", + [ + ((1, 2, 99, 198), .01), + ((1, 3, 99, 197), .015), + ((0, 0, 100, 200), 0.), + ((0, 0, 0, 0), 0.), + ] + ) + def test_get_maximum_group_observed_HPO_frequency( + self, + counts: typing.Tuple[int], + expected: float, + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ): + counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate) + + actual = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + counts_frame=counts_df, + ph_predicate=ph_predicate, + ) + + assert actual == pytest.approx(expected) + def test_filter_terms_to_test( self, mtc_filter: HpoMtcFilter, @@ -139,11 +186,9 @@ def test_filter_terms_to_test( suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], patient_counts: typing.Sequence[pd.DataFrame], ): - phenotypes = [p.phenotype for p in suox_pheno_predicates] - mtc_report = mtc_filter.filter( gt_predicate=suox_gt_predicate, - phenotypes=phenotypes, + ph_predicates=suox_pheno_predicates, counts=patient_counts, ) @@ -175,11 +220,10 @@ def test_specified_term_mtc_filter( specified_filter = SpecifiedTermsMtcFilter( terms_to_test=(hpotk.TermId.from_curie("HP:0032350"),), ) - phenotypes = [p.phenotype for p in suox_pheno_predicates] mtc_report = specified_filter.filter( gt_predicate=suox_gt_predicate, - phenotypes=phenotypes, + ph_predicates=suox_pheno_predicates, counts=patient_counts, ) assert isinstance(mtc_report, typing.Sequence) @@ -221,26 +265,51 @@ def test_min_observed_HPO_threshold( in enumerate(suox_pheno_predicates) } # Ectopia lentis HP:0001083 (6 9 1 2), freqs are 6/15=0.4 and 1/3=0.33 - ectopia = patient_counts[curie2idx["HP:0001083"]] - max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ectopia) + idx = curie2idx["HP:0001083"] + ectopia = patient_counts[idx] + ectopia_predicate = suox_pheno_predicates[idx] + max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + ectopia, + ph_predicate=ectopia_predicate, + ) assert max_f == pytest.approx(0.4, abs=EPSILON) # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1 - seizure = patient_counts[curie2idx["HP:0001250"]] - max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(seizure) + idx = curie2idx["HP:0001250"] + seizure = patient_counts[idx] + seizure_predicate = suox_pheno_predicates[idx] + max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + seizure, + ph_predicate=seizure_predicate + ) assert max_f == pytest.approx(1.0, abs=EPSILON) # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1 - sulfocysteinuria = patient_counts[curie2idx["HP:0032350"]] - max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(sulfocysteinuria) + idx = curie2idx["HP:0032350"] + sulfocysteinuria = patient_counts[idx] + sulfocysteinuria_predicate = suox_pheno_predicates[idx] + max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + sulfocysteinuria, + ph_predicate=sulfocysteinuria_predicate, + ) assert max_f == pytest.approx(1.0, abs=EPSILON) # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5 - ndelay = patient_counts[curie2idx["HP:0012758"]] - max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ndelay) + idx = curie2idx["HP:0012758"] + ndelay = patient_counts[idx] + ndelay_predicate = suox_pheno_predicates[idx] + max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + ndelay, + ph_predicate=ndelay_predicate, + ) assert max_f == pytest.approx(0.5, abs=EPSILON) # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714 - hypertonia = patient_counts[curie2idx["HP:0001276"]] - max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(hypertonia) + idx = curie2idx["HP:0001276"] + hypertonia = patient_counts[idx] + hypertonia_predicate = suox_pheno_predicates[idx] + max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + hypertonia, + ph_predicate=hypertonia_predicate, + ) assert max_f == pytest.approx(0.5714, abs=EPSILON) From af302c0a61fe5c38c88c893eb834075c1ca16917 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 17:03:59 +0200 Subject: [PATCH 14/16] Implement filtering genotype predicate for choosing the genotype categories of interest. --- src/gpsea/analysis/predicate/genotype/_api.py | 82 ++++++++++++++++++- .../predicate/genotype/test_gt_predicates.py | 82 +++++++++++++++++++ 2 files changed, 162 insertions(+), 2 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py index 39cf020be..a90bb5c88 100644 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ b/src/gpsea/analysis/predicate/genotype/_api.py @@ -1,7 +1,7 @@ import abc import typing -from gpsea.model import Variant +from gpsea.model import Patient, Variant from .._api import PolyPredicate, Categorization, PatientCategory @@ -10,7 +10,85 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate` that test the genotype axis. """ - pass + + @staticmethod + def filtering_predicate( + predicate: "GenotypePolyPredicate", + targets: typing.Collection[Categorization], + ) -> "GenotypePolyPredicate": + """ + """ + return FilteringGenotypePolyPredicate.create( + predicate=predicate, + targets=targets, + ) + + +class FilteringGenotypePolyPredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + @staticmethod + def create( + predicate: "GenotypePolyPredicate", + targets: typing.Collection[Categorization], + ) -> "FilteringGenotypePolyPredicate": + # At least 2 target categorizations must be provided + if len(targets) <= 1: + raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}') + + good_boys = tuple(isinstance(cat, Categorization) for cat in targets) + if not all(good_boys): + offenders = ', '.join( + str(i) + for i, is_instance + in enumerate(good_boys) if not is_instance + ) + raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]') + + # All `allowed` categorizations must in fact be present in the `base` predicate. + cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets) + if not all(cats_are_in_fact_present): + missing = ', '.join( + c.category.name + for c, is_present + in zip(targets, cats_are_in_fact_present) if not is_present + ) + raise ValueError(f'Some from the categories are not present: {missing}') + + if len(targets) == predicate.n_categorizations(): + raise ValueError( + f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations ' + f'with the same number ({len(targets)}) of targets' + ) + + return FilteringGenotypePolyPredicate( + predicate=predicate, + allowed=targets, + ) + + def __init__( + self, + predicate: "GenotypePolyPredicate", + allowed: typing.Iterable[Categorization], + ): + self._predicate = predicate + self._allowed = tuple(allowed) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return self._allowed + + def get_question(self) -> str: + return self._predicate.get_question() + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + cat = self._predicate.test(patient) + if cat in self._allowed: + return cat + else: + return None + + def __repr__(self): + return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 36904159f..adbcf532d 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -1,3 +1,5 @@ +import typing + import pytest from gpsea.model import * @@ -180,3 +182,83 @@ def test_x_recessive( assert categorization is not None assert categorization.category.name == name + + +class TestPolyPredicate: + + @pytest.fixture(scope="class") + def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: + affects_suox = VariantPredicates.gene("SUOX") + return ModeOfInheritancePredicate.x_recessive( + variant_predicate=affects_suox, + ) + + @pytest.mark.parametrize( + "indices, expected", + [ + ((0, 1), 2), + ((1, 0), 2), + ((1, 2), 2), + ], + ) + def test_filtering_predicate( + self, + indices: typing.Sequence[int], + expected: int, + x_recessive_gt_predicate: GenotypePolyPredicate, + ): + cats = x_recessive_gt_predicate.get_categorizations() + targets = [cats[i] for i in indices] + predicate = GenotypePolyPredicate.filtering_predicate( + predicate=x_recessive_gt_predicate, + targets=targets, + ) + + actual = len(predicate.get_categorizations()) + + assert actual == expected + + def test_filtering_predicate__explodes_when_not_subsetting( + self, + x_recessive_gt_predicate: GenotypePolyPredicate, + ): + with pytest.raises(ValueError) as ve: + GenotypePolyPredicate.filtering_predicate( + predicate=x_recessive_gt_predicate, + targets=x_recessive_gt_predicate.get_categorizations(), + ) + + assert ( + ve.value.args[0] + == "It makes no sense to subset the a predicate with 4 categorizations with the same number (4) of targets" + ) + + def test_filtering_predicate__explodes_when_using_random_junk( + self, + x_recessive_gt_predicate: GenotypePolyPredicate, + ): + with pytest.raises(ValueError) as ve: + GenotypePolyPredicate.filtering_predicate( + predicate=x_recessive_gt_predicate, + targets=(0, 1), + ) + + assert ( + ve.value.args[0] + == "The targets at following indices are not categorizations: [0, 1]" + ) + + def test_filtering_predicate__explodes_when_using_one_category( + self, + x_recessive_gt_predicate: GenotypePolyPredicate, + ): + with pytest.raises(ValueError) as ve: + GenotypePolyPredicate.filtering_predicate( + predicate=x_recessive_gt_predicate, + targets=(x_recessive_gt_predicate.get_categorizations()[0],), + ) + + assert ( + ve.value.args[0] + == "At least 2 target categorizations must be provided but got 1" + ) From 21caddc68d76261ca9f494e62d8de818cfdb73b8 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 17:22:05 +0200 Subject: [PATCH 15/16] Improve generating the polypredicate question. --- docs/tutorial.rst | 2 +- docs/user-guide/predicates.rst | 8 ++++---- docs/user-guide/stats.rst | 4 ++-- src/gpsea/analysis/_api.py | 2 +- src/gpsea/analysis/_gp_analysis.py | 4 ++-- src/gpsea/analysis/pcats/_impl.py | 6 +++--- src/gpsea/analysis/predicate/_api.py | 17 ++++++++++++++++- src/gpsea/analysis/predicate/genotype/_api.py | 4 ++-- .../predicate/genotype/_gt_predicates.py | 17 +++++++---------- .../analysis/predicate/phenotype/_pheno.py | 6 +++--- .../predicate/genotype/test_gt_predicates.py | 4 ++-- 11 files changed, 43 insertions(+), 31 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 29d8f2d9a..0a583b39e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -171,7 +171,7 @@ in the individuals of the *TBX5* cohort. ... ), ... group_names=('Missense', 'Frameshift'), ... ) ->>> gt_predicate.get_question() +>>> gt_predicate.display_question() 'Genotype group: Missense, Frameshift' .. note:: diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 908afa147..6d95dfc5e 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -300,7 +300,7 @@ for assigning a patient into a genotype group: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.get_question() +>>> gt_predicate.display_question() 'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' The `gt_predicate` can be used in downstream analysis, such as in :class: @@ -337,7 +337,7 @@ for testing if the individual has at least one missense vs. frameshift vs. synon ... ), ... group_names=('Missense', 'Frameshift', 'Synonymous'), ... ) ->>> gt_predicate.get_question() +>>> gt_predicate.display_question() 'Genotype group: Missense, Frameshift, Synonymous' @@ -380,8 +380,8 @@ to test for a presence of `Abnormal lens morphology >> pheno_predicate.get_question() -'Is Abnormal lens morphology present in the patient?' +>>> pheno_predicate.display_question() +'Is Abnormal lens morphology present in the patient: Yes, No' TODO: explain ``missing_implies_phenotype_excluded`` diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 16f0060b7..5a8717f3b 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -138,7 +138,7 @@ we expect the autosomal dominant mode of inheritance: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) ->>> gt_predicate.get_question() +>>> gt_predicate.display_question() 'Which genotype group does the patient fit in: HOM_REF, HET' `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group @@ -409,7 +409,7 @@ The genotype predicate will bin the patient into two groups: a point mutation gr ... predicates=(point_mutation, lof_mutation), ... group_names=('Point', 'LoF'), ... ) ->>> gt_predicate.get_question() +>>> gt_predicate.display_question() 'Genotype group: Point, LoF' diff --git a/src/gpsea/analysis/_api.py b/src/gpsea/analysis/_api.py index ca7bd8e49..b2b7d41d7 100644 --- a/src/gpsea/analysis/_api.py +++ b/src/gpsea/analysis/_api.py @@ -189,7 +189,7 @@ def summarize( # Column index: multiindex of counts and percentages for all genotype predicate groups geno_idx = pd.MultiIndex.from_product( iterables=(self._geno_predicate.get_categories(), ('Count', 'Percent')), - names=(self._geno_predicate.get_question(), None), + names=(self._geno_predicate.get_question_base(), None), ) # We'll fill this frame with data diff --git a/src/gpsea/analysis/_gp_analysis.py b/src/gpsea/analysis/_gp_analysis.py index 35d143053..f786049ce 100644 --- a/src/gpsea/analysis/_gp_analysis.py +++ b/src/gpsea/analysis/_gp_analysis.py @@ -63,11 +63,11 @@ def apply_predicates_on_patients( data=0, index=pd.Index( data=ph_predicate.get_categories(), - name=ph_predicate.get_question(), + name=ph_predicate.get_question_base(), ), columns=pd.Index( data=gt_predicate.get_categories(), - name=gt_predicate.get_question(), + name=gt_predicate.get_question_base(), ), ) diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 408a9fd9c..e8045d2d0 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -64,11 +64,11 @@ def apply_predicates_on_patients( data=0, index=pd.Index( data=ph_predicate.get_categories(), - name=ph_predicate.get_question(), + name=ph_predicate.get_question_base(), ), columns=pd.Index( data=gt_predicate.get_categories(), - name=gt_predicate.get_question(), + name=gt_predicate.get_question_base(), ), ) @@ -198,7 +198,7 @@ def summarize( gt_idx = pd.MultiIndex.from_product( # TODO: fix the below iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")), - names=(self._gt_predicate.get_question(), None), + names=(self._gt_predicate.get_question_base(), None), ) # We'll fill this frame with data diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py index da446a546..534eca18d 100644 --- a/src/gpsea/analysis/predicate/_api.py +++ b/src/gpsea/analysis/predicate/_api.py @@ -154,6 +154,12 @@ def get_categories(self) -> typing.Iterator[PatientCategory]: """ return (c.category for c in self.get_categorizations()) + def get_category_names(self) -> typing.Iterator[str]: + """ + Get an iterator with names of the :class:`PatientCategory` items that the predicate can produce. + """ + return (cat.name for cat in self.get_categories()) + def get_category( self, cat_id: int, @@ -182,12 +188,21 @@ def get_category_name( return self.get_category(cat_id).name @abc.abstractmethod - def get_question(self) -> str: + def get_question_base(self) -> str: """ Prepare a `str` with the question the predicate can answer. """ pass + def display_question(self) -> str: + """ + Prepare the question which the predicate can answer. + + The question includes the question base and the category names + """ + cat_names = ', '.join(self.get_category_names()) + return f'{self.get_question_base()}: {cat_names}' + @abc.abstractmethod def test(self, patient: Patient) -> typing.Optional[C]: """ diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py index a90bb5c88..db29c573d 100644 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ b/src/gpsea/analysis/predicate/genotype/_api.py @@ -77,8 +77,8 @@ def __init__( def get_categorizations(self) -> typing.Sequence[Categorization]: return self._allowed - def get_question(self) -> str: - return self._predicate.get_question() + def get_question_base(self) -> str: + return self._predicate.get_question_base() def test(self, patient: Patient) -> typing.Optional[Categorization]: cat = self._predicate.test(patient) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 23fc4a5df..4007e8d1b 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -39,7 +39,7 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: AlleleCountingGenotypeBooleanPredicate.NO, ) - def get_question(self) -> str: + def get_question_base(self) -> str: return self._allele_counter.get_question() def test(self, patient: Patient) -> typing.Optional[Categorization]: @@ -92,13 +92,12 @@ def __init__( ): self._counters = tuple(counters) self._categorizations = tuple(categorizations) - group_names = ", ".join(c.category.name for c in self._categorizations) - self._question = f"Genotype group: {group_names}" + self._question = "Genotype group" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations - def get_question(self) -> str: + def get_question_base(self) -> str: return self._question def test(self, patient: Patient) -> typing.Optional[Categorization]: @@ -132,7 +131,7 @@ def __hash__(self) -> int: ) def __str__(self) -> str: - return self.get_question() + return self.get_question_base() def __repr__(self) -> str: return ( @@ -194,7 +193,7 @@ def __init__( ): self._allele_counter = allele_counter - def get_question(self) -> str: + def get_question_base(self) -> str: return self._allele_counter.get_question() def test(self, patient: Patient) -> typing.Optional[Categorization]: @@ -558,14 +557,12 @@ def __init__( issues = ModeOfInheritancePredicate._check_categorizations(self._categorizations) if issues: raise ValueError('Cannot create predicate: {}'.format(', '.join(issues))) - self._question = 'Which genotype group does the patient fit in: {}'.format( - ', '.join(cat.category.name for cat in self._categorizations), - ) + self._question = 'Which genotype group does the patient fit in' def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations - def get_question(self) -> str: + def get_question_base(self) -> str: return self._question def test( diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index ecfb14018..3b2d9a7b4 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -131,8 +131,8 @@ def __init__( # Some tests depend on the order of `self._categorizations`. self._categorizations = (self._phenotype_observed, self._phenotype_excluded) - def get_question(self) -> str: - return f"Is {self._query_label} present in the patient?" + def get_question_base(self) -> str: + return f"Is {self._query_label} present in the patient" @property def phenotype(self) -> hpotk.TermId: @@ -222,7 +222,7 @@ def __init__(self, disease_id_query: hpotk.TermId): phenotype=disease_id_query, ) - def get_question(self) -> str: + def get_question_base(self) -> str: return f"Was {self._query} diagnosed in the patient" @property diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index adbcf532d..97d4fee06 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -36,8 +36,8 @@ def test_get_question( self, predicate: GenotypePolyPredicate, ): - question = predicate.get_question() - assert question == "Genotype group: Point, LoF" + question = predicate.get_question_base() + assert question == "Genotype group" def test_get_categorizations( self, From 8ba618e3f71f599fd1629d19b104c307504c1fcd Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 3 Sep 2024 20:46:24 +0200 Subject: [PATCH 16/16] Improve docs for `filtering_predicate`. --- docs/user-guide/predicates.rst | 58 +++++++++++ .../analysis/predicate/genotype/__init__.py | 4 +- src/gpsea/analysis/predicate/genotype/_api.py | 82 +--------------- .../predicate/genotype/_gt_predicates.py | 95 +++++++++++++++++++ .../predicate/genotype/test_gt_predicates.py | 9 +- .../predicate/genotype/test_predicates.py | 1 + 6 files changed, 163 insertions(+), 86 deletions(-) diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 6d95dfc5e..b224516f5 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -306,6 +306,64 @@ for assigning a patient into a genotype group: The `gt_predicate` can be used in downstream analysis, such as in :class: +.. _filtering-predicate: + +Filtering predicate +=================== + +Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need +to consider only a subset of the groups. A `GenotypePolyPredicate` +created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset +of the target categorizations of interest. + +Example +------- + +Let's suppose we want test the genotype-phenotype association between variants +that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, +and we are specifically interested in comparing the heterozygous variants +in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). + +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +to bin according to a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.display_question() +'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' + +We see that the `gt_predicate` bins the patients into three groups: + +>>> cats = gt_predicate.get_categorizations() +>>> cats +(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) + +We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, +and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +that includes only the categories of interest: + +>>> from gpsea.analysis.predicate.genotype import filtering_predicate +>>> fgt_predicate = filtering_predicate( +... predicate=gt_predicate, +... targets=(cats[1], cats[2]), +... ) +>>> fgt_predicate.display_question() +'Which genotype group does the patient fit in: HET, BIALLELIC_ALT' + + +.. _groups-predicate: + Groups predicate ================ diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index ff8d16e1b..86dd261a2 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,13 +1,13 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'recessive_predicate', + 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py index db29c573d..39cf020be 100644 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ b/src/gpsea/analysis/predicate/genotype/_api.py @@ -1,7 +1,7 @@ import abc import typing -from gpsea.model import Patient, Variant +from gpsea.model import Variant from .._api import PolyPredicate, Categorization, PatientCategory @@ -10,85 +10,7 @@ class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate` that test the genotype axis. """ - - @staticmethod - def filtering_predicate( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "GenotypePolyPredicate": - """ - """ - return FilteringGenotypePolyPredicate.create( - predicate=predicate, - targets=targets, - ) - - -class FilteringGenotypePolyPredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - - @staticmethod - def create( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "FilteringGenotypePolyPredicate": - # At least 2 target categorizations must be provided - if len(targets) <= 1: - raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}') - - good_boys = tuple(isinstance(cat, Categorization) for cat in targets) - if not all(good_boys): - offenders = ', '.join( - str(i) - for i, is_instance - in enumerate(good_boys) if not is_instance - ) - raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]') - - # All `allowed` categorizations must in fact be present in the `base` predicate. - cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets) - if not all(cats_are_in_fact_present): - missing = ', '.join( - c.category.name - for c, is_present - in zip(targets, cats_are_in_fact_present) if not is_present - ) - raise ValueError(f'Some from the categories are not present: {missing}') - - if len(targets) == predicate.n_categorizations(): - raise ValueError( - f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations ' - f'with the same number ({len(targets)}) of targets' - ) - - return FilteringGenotypePolyPredicate( - predicate=predicate, - allowed=targets, - ) - - def __init__( - self, - predicate: "GenotypePolyPredicate", - allowed: typing.Iterable[Categorization], - ): - self._predicate = predicate - self._allowed = tuple(allowed) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return self._allowed - - def get_question_base(self) -> str: - return self._predicate.get_question_base() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - cat = self._predicate.test(patient) - if cat in self._allowed: - return cat - else: - return None - - def __repr__(self): - return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" + pass class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 4007e8d1b..43a39594a 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -151,6 +151,8 @@ def groups_predicate( The genotype groups *should* not overlap. In case of an overlap, the patient will be assigned into no group (`None`). + See the :ref:`groups-predicate` section for an example. + :param predicates: an iterable with at least 2 variant predicates to determine a genotype group. :param group_names: an iterable with group names. The number of group names must match the number of predicates. """ @@ -182,6 +184,99 @@ def groups_predicate( ) +class FilteringGenotypePolyPredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + @staticmethod + def create( + predicate: "GenotypePolyPredicate", + targets: typing.Collection[Categorization], + ) -> "FilteringGenotypePolyPredicate": + # At least 2 target categorizations must be provided + if len(targets) <= 1: + raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}') + + good_boys = tuple(isinstance(cat, Categorization) for cat in targets) + if not all(good_boys): + offenders = ', '.join( + str(i) + for i, is_instance + in enumerate(good_boys) if not is_instance + ) + raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]') + + # All `allowed` categorizations must in fact be present in the `base` predicate. + cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets) + if not all(cats_are_in_fact_present): + missing = ', '.join( + c.category.name + for c, is_present + in zip(targets, cats_are_in_fact_present) if not is_present + ) + raise ValueError(f'Some from the categories are not present: {missing}') + + if len(targets) == predicate.n_categorizations(): + raise ValueError( + f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations ' + f'with the same number ({len(targets)}) of targets' + ) + + return FilteringGenotypePolyPredicate( + predicate=predicate, + allowed=targets, + ) + + def __init__( + self, + predicate: "GenotypePolyPredicate", + allowed: typing.Iterable[Categorization], + ): + self._predicate = predicate + self._allowed = tuple(allowed) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return self._allowed + + def get_question_base(self) -> str: + return self._predicate.get_question_base() + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + cat = self._predicate.test(patient) + if cat in self._allowed: + return cat + else: + return None + + def __repr__(self): + return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" + + +def filtering_predicate( + predicate: GenotypePolyPredicate, + targets: typing.Collection[Categorization], +) -> GenotypePolyPredicate: + """ + Filtering predicate applies the base `predicate` but only returns the categorizations + from the provided `targets` collection. + + This can be useful if only some of the categorizations are interesting. + For instance, if we only seek to compare the differences between heterozygous and hemizygous variants, + but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups. + + See the :ref:`filtering-predicate` section for an example. + + The `predicate` is checked for being able to produce the all items in `targets` + and the `targets` must include at least 2 categorizations. + + :param predicate: the base predicate whose categorizations are subject to filteration. + :param targets: the categorizations to retain + """ + return FilteringGenotypePolyPredicate.create( + predicate=predicate, + targets=targets, + ) + + class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate): # NOT PART OF THE PUBLIC API # TODO: this predicate is a bit weird and I think it should eventually go away. diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 97d4fee06..67bc2d117 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -6,6 +6,7 @@ from gpsea.analysis.predicate.genotype import ( GenotypePolyPredicate, groups_predicate, + filtering_predicate, VariantPredicates, VariantPredicate, ModeOfInheritancePredicate, @@ -209,7 +210,7 @@ def test_filtering_predicate( ): cats = x_recessive_gt_predicate.get_categorizations() targets = [cats[i] for i in indices] - predicate = GenotypePolyPredicate.filtering_predicate( + predicate = filtering_predicate( predicate=x_recessive_gt_predicate, targets=targets, ) @@ -223,7 +224,7 @@ def test_filtering_predicate__explodes_when_not_subsetting( x_recessive_gt_predicate: GenotypePolyPredicate, ): with pytest.raises(ValueError) as ve: - GenotypePolyPredicate.filtering_predicate( + filtering_predicate( predicate=x_recessive_gt_predicate, targets=x_recessive_gt_predicate.get_categorizations(), ) @@ -238,7 +239,7 @@ def test_filtering_predicate__explodes_when_using_random_junk( x_recessive_gt_predicate: GenotypePolyPredicate, ): with pytest.raises(ValueError) as ve: - GenotypePolyPredicate.filtering_predicate( + filtering_predicate( predicate=x_recessive_gt_predicate, targets=(0, 1), ) @@ -253,7 +254,7 @@ def test_filtering_predicate__explodes_when_using_one_category( x_recessive_gt_predicate: GenotypePolyPredicate, ): with pytest.raises(ValueError) as ve: - GenotypePolyPredicate.filtering_predicate( + filtering_predicate( predicate=x_recessive_gt_predicate, targets=(x_recessive_gt_predicate.get_categorizations()[0],), ) diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py index 2f78d7185..a076adacf 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ b/tests/analysis/predicate/genotype/test_predicates.py @@ -202,6 +202,7 @@ def test_protein_feature_id( assert predicate.test(missense_variant) == expected + class TestLogicalVariantPredicate: """ Test that the AND and OR variant predicate combinators work as expected.
Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.
Code
TODOSkipping general term44Skipping term with maximum frequency that was less than threshold 0.251
TODOSkipping term because all genotypes have same HPO observed proportions42Skipping general term44
TODOSkipping term with only 0 observations (not powered for 2x2)Skipping term because all genotypes have same HPO observed proportions 41
12
TODOSkipping term with maximum frequency that was less than threshold 0.210
TODO