From 3819244f331929955ce0a898a170599fbfc467c0 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 6 Sep 2024 13:33:19 +0200 Subject: [PATCH 1/2] updating documentation (WIP) --- README.md | 3 +- docs/index.rst | 4 +- docs/user-guide/index.rst | 9 +- docs/user-guide/predicates.rst | 466 +----------------- docs/user-guide/{ => predicates}/devries.rst | 0 .../predicates/diagnosis_predicate.rst | 21 + .../predicates/filtering_predicate.rst | 56 +++ .../predicates/genotype_predicates.rst | 33 ++ .../predicates/groups_predicate.rst | 41 ++ docs/user-guide/predicates/hpo_predicate.rst | 44 ++ .../predicates/male_female_predicate.rst | 21 + .../mode_of_inheritance_predicate.rst | 97 ++++ .../{ => predicates}/phenotype_predicates.rst | 1 + .../predicates/variant_predicates.rst | 168 +++++++ docs/user-guide/report/tbx5_frameshift.csv | 2 +- docs/user-guide/stats.rst | 2 +- .../predicate/genotype/_gt_predicates.py | 2 +- 17 files changed, 511 insertions(+), 459 deletions(-) rename docs/user-guide/{ => predicates}/devries.rst (100%) create mode 100644 docs/user-guide/predicates/diagnosis_predicate.rst create mode 100644 docs/user-guide/predicates/filtering_predicate.rst create mode 100644 docs/user-guide/predicates/genotype_predicates.rst create mode 100644 docs/user-guide/predicates/groups_predicate.rst create mode 100644 docs/user-guide/predicates/hpo_predicate.rst create mode 100644 docs/user-guide/predicates/male_female_predicate.rst create mode 100644 docs/user-guide/predicates/mode_of_inheritance_predicate.rst rename docs/user-guide/{ => predicates}/phenotype_predicates.rst (94%) create mode 100644 docs/user-guide/predicates/variant_predicates.rst diff --git a/README.md b/README.md index 2a498eb4..449314b9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ ![PyPi downloads](https://img.shields.io/pypi/dm/gpsea.svg?label=Pypi%20downloads) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gpsea) -GPSEA is a Python library for discovery of genotype-phenotype associations. +GPSEA (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. + See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) and a comprehensive [User guide](https://monarch-initiative.github.io/gpsea/stable/user-guide/index.html) diff --git a/docs/index.rst b/docs/index.rst index f9478fa2..b70bf9c2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,8 +10,8 @@ A key question in biology and human genetics concerns the relationships between genetics, the focus is generally placed on the study of whether specific disease-causing alleles are associated with specific phenotypic manifestations of the disease. -`GPSEA` (genotypes and phenotypes - study and evaluation of associations) is a Python package designed to support genotype-phenotype correlation analysis. -We pronounce GPSEA as "G"-"P"-"C". The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. +`GPSEA` (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. `gpsea` ingests data from these phenopackets and performs analysis of the correlation of specific variants, variant types (e.g., missense vs. premature termination codon), or variant location in protein motifs or other features. The phenotypic abnormalities are represented by `Human Phenotype Ontology (HPO) `_ terms. diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index f3e0569a..08097f40 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -4,7 +4,13 @@ User guide ========== -TODO - write high level overview and bridge to individual sections. +GPSEA allows users to perform many different kinds of genotype-phenotype correlation (GPCs) analysis. See the :ref:`tutorial` for an introduction. +In general, the analysis will include steps for data input, exploration of the cohort to assist in generating hypotheses about potential GPCs to test, +corresponding choice of genotype and phenotype predicates to perform the test, choice of statistical test, and approach to multiple-testing correction (mtc). +The pages shown in the table of contents provide more information about each step. + + + .. toctree:: :maxdepth: 1 @@ -13,7 +19,6 @@ TODO - write high level overview and bridge to individual sections. input-data exploratory predicates - phenotype_predicates stats mtc glossary diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 785c3ef0..ed63f9d3 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -4,484 +4,48 @@ Predicates ========== -Searching for genotype-phenotype associations usually requires to partition -the individuals into several discrete groups to allow testing for the inter-group differences. +Searching for genotype-phenotype associations usually requires that +the individuals are partitioned into two or more discrete groups to allow testing for the inter-group differences. GPSEA reflects these requirements with its predicate API. -Perhaps unsurprisingly, a predicate must be capable of partitioning the individuals into two or more groups. + + +A **predicate** must be capable of partitioning the individuals into two or more groups. The groups must be *exclusive* - each individual must be assigned at most into one group. -Moreover, the groups should be *exhaustive* and cover maximum of the possible states. +In general, it is desirable that the groups cover all or at least the majority of the cohort being analyzed to maximize statistical power. However, the predicate is allowed to return `None` if the individual cannot be assigned. -In result, the individual will be omitted from the downstream analysis. +As a result, the individual will be omitted from the downstream analysis. -Predicates serve both *genotype* and *phenotype* prongs of the analysis. +Predicates serve both the *genotype* and *phenotype* prongs of the analysis. Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`) assign the :class:`~gpsea.model.Patient` into a group (mostly) based on the variant information, while the phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`) use the HPO terms to assign a group. -It is literally impossible to use GPSEA without the predicates -because all analyses need at least one predicate (typically a *genotype* predicate). -Luckily, the purpose of this guide is to show all that is to know about predicates. -We will first discuss the genotype predicates and end with phenotype predicates. +All GPSEA analyses need at least one predicate (typically a *genotype* predicate) and many require both *genotype* and *phenotype* predicates. +The following pages provide more information. .. _genotype-predicates: -******************* -Genotype predicates -******************* - -A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. -Typically, this includes using the genotype data, such as presence of a missense variant -in a heterozygous genotype. However, other categorical variables, -such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. - -The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. -The testing is done in two steps. First, we count the alleles -of the matching variants and then we interpret the count, possibly including factors -such as the expected mode of inheritance and sex, to assign the individual into a group. -Finding the matching variants is what -the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. - - -TODO: wordsmith -We must first create the variant predicate and then wrap it in genotype predicate. - - -Variant predicates -================== - -GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class -to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. -The variant predicate can leverage multiple primary data: - -+------------------------+-------------------------------------------------------------------------------------------------+ -| Primary data source | Example | -+========================+=================================================================================================+ -| Allele | the variant being a deletion or a single nucleotide variant (SNV) | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Genome | overlaps of a target genomic region | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Protein data | variant is located in a region encoding a protein domain, protein feature type | -+------------------------+-------------------------------------------------------------------------------------------------+ - -which demands a considerable amount of flexibility for creating the predicate. - -As a rule of thumb, the predicates for testing basic conditions are available off the shelf, -and they can be used as building block for testing for more complex conditions, -such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". - -Let's demonstrate this on few examples. -We will load a cohort of 19 subjects with variants in *RERE*, -leading to `Holt-Oram syndrome MIM:142900 `_. -The the clinical signs and symptoms of the subjects were encoded into HPO terms -along with the pathogenic *RERE* variant. - -Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. -Briefly, we first load HPO: - ->>> import hpotk ->>> store = hpotk.configure_ontology_store() ->>> hpo = store.load_minimal_hpo(release='v2024-07-01') - -then, we configure the cohort creator: - ->>> from gpsea.preprocessing import configure_caching_cohort_creator ->>> cohort_creator = configure_caching_cohort_creator(hpo) - -which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets -from the release `0.1.18` of `Phenopacket Store `_. -We will load 19 individuals with mutations in *RERE* gene: - ->>> from ppktstore.registry import configure_phenopacket_registry ->>> registry = configure_phenopacket_registry() ->>> with registry.open_phenopacket_store(release='0.1.18') as ps: -... phenopackets = tuple(ps.iter_cohort_phenopackets('RERE')) ->>> len(phenopackets) -19 - -and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: - ->>> from gpsea.preprocessing import load_phenopackets ->>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... - -To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds -to a pathogenic variant `VCV000522858.5 `_ -that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with arginine: ``NM_001042681.2(RERE):c.4304A>G (p.His1435Arg)``. - ->>> variant_key_of_interest = '1_8358231_8358231_T_C' ->>> variant = cohort.get_variant_by_key(variant_key_of_interest) - -Building blocks ---------------- - -We can check that the variant overlaps with *RERE*: - ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> gene = VariantPredicates.gene('RERE') ->>> gene.test(variant) -True - -it overlaps with the *MANE* transcript: - ->>> rere_mane_tx_id = 'NM_001042681.2' ->>> tx = VariantPredicates.transcript(rere_mane_tx_id) ->>> tx.test(variant) -True - -it in fact overlaps with the exon 20: - ->>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) ->>> exon20.test(variant) -True - -and leads to a missense mutation with respect to the MANE transcript: - ->>> from gpsea.model import VariantEffect ->>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) ->>> missense.test(variant) -True - -See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` -for more info on the predicates available off the shelf. - - -Complex conditions ------------------- - -We can combine the building blocks to test for more elaborate conditions. -For instance, we can test if the variant meets *any* or several conditions: ->>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) ->>> missense_or_nonsense = missense | nonsense ->>> missense_or_nonsense.test(variant) -True -or *all* conditions: +.. toctree:: + :maxdepth: 1 + :caption: Contents: ->>> missense_and_exon20 = missense & exon20 ->>> missense_and_exon20.test(variant) -True + predicates/phenotype_predicates + predicates/genotype_predicates -The `VariantPredicate` overloads Python ``&`` (AND) and ``|`` (OR) operators to build a compound predicate from lower level building blocks. -Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, -such as testing if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: - ->>> from gpsea.model import VariantClass ->>> chromosomal_deletion = "SO:1000029" ->>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) ->>> predicate.get_question() -'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' - - -Inverting conditions --------------------- - -Sometimes we may want to test the variant for a condition that must *not* be met. -For instance, we may want to test if the variant is a deletion -that is *not* predicted to shift the transcript reading frame. -One of doing this would be to build a compound predicates -for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_VARIANT`: - ->>> non_frameshift_effects = ( -... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, -... # and many more effects.. -... ) ->>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) - -However, this is clearly tedious and it would be much better implemented -by a simple logical not of a predicate for a frameshift variant effect. - -To support this, `VariantPredicate` implements *logical inversion* -which corresponds to Python's ``~`` operator (tilde), to wrap -the underlying predicate and to invert its test result. - -This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: - ->>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) ->>> non_frameshift_del.get_question() -'(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' - -Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. - -The variant predicate offers a flexible API for testing if variants meet a condition. -However, the genotype phenotype correlations are done on the individual level -and the variant predicates are used as a component of the genotype predicate. -The next sections show how to use variant predicates to assign individuals into groups. - - -Mode of inheritance predicate -============================= - -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` -assigns the individual into a group based on the number of alleles -that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports -the following Mendelian modes of inheritance (MoI): - - -+-----------------------+-----------------------------------+------------------+------------------------+ -| Mode of inheritance | Sex | Allele count | Genotype category | -+=======================+===================================+==================+========================+ -| Autosomal dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| Autosomal recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 2 | `BIALLELIC_ALT` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 3` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | -+ + +------------------+------------------------+ -| | | 2 | `BIALLELIC_ALT` | -+ + +------------------+------------------------+ -| | | :math:`\ge 3` | ``None`` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | -+ + +------------------+------------------------+ -| | | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ - -.. note:: - - `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. - -Clinical judgment should be used to choose the MoI for the cohort analysis. -Then a predicate for the desired MoI can be created by one of -:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: - -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` - -All constructors take an instance -of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. - - -Example -------- - -Here we show seting up a predicate for grouping individuals based on -having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` -to test differences between the genotypes of a disease with an autosomal recessive MoI. - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant meets the condition: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -for assigning a patient into a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' - -The `gt_predicate` can be used in downstream analysis, such as in :class: - - -.. _filtering-predicate: - -Filtering predicate -=================== - -Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need -to consider only a subset of the groups. A `GenotypePolyPredicate` -created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset -of the target categorizations of interest. - -Example -------- - -Let's suppose we want test the genotype-phenotype association between variants -that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, -and we are specifically interested in comparing the heterozygous variants -in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -to bin according to a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' - -We see that the `gt_predicate` bins the patients into three groups: - ->>> cats = gt_predicate.get_categorizations() ->>> cats -(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) - -We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, -and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -that includes only the categories of interest: - ->>> from gpsea.analysis.predicate.genotype import filtering_predicate ->>> fgt_predicate = filtering_predicate( -... predicate=gt_predicate, -... targets=(cats[1], cats[2]), -... ) ->>> fgt_predicate.display_question() -'Which genotype group does the patient fit in: HET, BIALLELIC_ALT' .. _groups-predicate: -Groups predicate -================ - -Sometimes, all we want is to compare if there is a difference between individuals -who include one or more alleles of variant $X$ vs. individuals with variants $Y$, -vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. -We can do this with a *groups* predicate. - -The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` -takes *n* variant predicates and *n* group labels, and it will assign the patients -into the respective groups if one or more matching allele is found. -However, only one predicate is allowed to return a non-zero allele count. -Otherwise, the patient is assigned with ``None`` and excluded from the analysis. - -Example -------- - -Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate ->>> tx_id = 'NM_1234.5' ->>> gt_predicate = groups_predicate( -... predicates=( -... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), -... ), -... group_names=('Missense', 'Frameshift', 'Synonymous'), -... ) ->>> gt_predicate.display_question() -'Genotype group: Missense, Frameshift, Synonymous' - - -.. _sex-predicate: - -Partition by the sex of the individual -====================================== - -It is easy to investigate the phenotypic differences between females and males. -The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate -for partitioning based on the sex of the individual: - ->>> from gpsea.analysis.predicate.genotype import sex_predicate ->>> gt_predicate = sex_predicate() ->>> gt_predicate.display_question() -'Sex of the individual: FEMALE, MALE' - -The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. - - -.. _diagnosis-predicate: - -Partition by a diagnosis -======================== - -It is also possible to bin the individuals based on a diagnosis. -The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` -prepares a genotype predicate for assigning an individual into a diagnosis group: - ->>> from gpsea.analysis.predicate.genotype import diagnosis_predicate ->>> gt_predicate = diagnosis_predicate( -... diagnoses=('OMIM:154700', 'OMIM:129600'), -... labels=('Marfan syndrome', 'Ectopia lentis, familial'), -... ) ->>> gt_predicate.display_question() -'What disease was diagnosed: OMIM:154700, OMIM:129600' - -Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses -(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) -will be automatically omitted from the analysis. .. _phenotype-predicates: -******************** -Phenotype predicates -******************** - -The phenotype predicate assigns the individual into a group with respect to tested phenotype. -Typically, the phenotype corresponds to a clinical sign or symptom encoded into an HPO term. - - -Propagating phenotype predicate -=============================== - -When testing for presence or absence of an HPO term, the propagating phenotype predicate -leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. -In result, an individual annotated with a term is implicitly annotated with all its ancestors. -For instance, an individual annotated with `Ectopia lentis `_ -is also annotated with `Abnormal lens morphology `_, -`Abnormal anterior eye segment morphology `_, -`Abnormal eye morphology `_, ... - -Similarly, all descendants of a term, whose presence was specifically excluded in an individual, -are implicitly excluded. - -:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic. - -Example -------- - -Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` -to test for a presence of `Abnormal lens morphology `_. - - ->>> from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate ->>> query = hpotk.TermId.from_curie('HP:0000517') ->>> pheno_predicate = PropagatingPhenotypePredicate( -... hpo=hpo, -... query=query, -... ) ->>> pheno_predicate.display_question() -'Is Abnormal lens morphology present in the patient: Yes, No' - -TODO: explain ``missing_implies_phenotype_excluded`` Predicates for all cohort phenotypes diff --git a/docs/user-guide/devries.rst b/docs/user-guide/predicates/devries.rst similarity index 100% rename from docs/user-guide/devries.rst rename to docs/user-guide/predicates/devries.rst diff --git a/docs/user-guide/predicates/diagnosis_predicate.rst b/docs/user-guide/predicates/diagnosis_predicate.rst new file mode 100644 index 00000000..6ce811ab --- /dev/null +++ b/docs/user-guide/predicates/diagnosis_predicate.rst @@ -0,0 +1,21 @@ +.. _diagnosis_predicate: + +======================== +Partition by a diagnosis +======================== + +It is also possible to bin the individuals based on a diagnosis. +The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` +prepares a genotype predicate for assigning an individual into a diagnosis group: + +>>> from gpsea.analysis.predicate.genotype import diagnosis_predicate +>>> gt_predicate = diagnosis_predicate( +... diagnoses=('OMIM:154700', 'OMIM:129600'), +... labels=('Marfan syndrome', 'Ectopia lentis, familial'), +... ) +>>> gt_predicate.display_question() +'What disease was diagnosed: OMIM:154700, OMIM:129600' + +Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses +(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) +will be automatically omitted from the analysis. \ No newline at end of file diff --git a/docs/user-guide/predicates/filtering_predicate.rst b/docs/user-guide/predicates/filtering_predicate.rst new file mode 100644 index 00000000..2d47e1d3 --- /dev/null +++ b/docs/user-guide/predicates/filtering_predicate.rst @@ -0,0 +1,56 @@ +.. _filtering-predicate: + + +=================== +Filtering predicate +=================== + +Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need +to consider only a subset of the groups. A `GenotypePolyPredicate` +created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset +of the target categorizations of interest. + +Example +------- + +Let's suppose we want test the genotype-phenotype association between variants +that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, +and we are specifically interested in comparing the heterozygous variants +in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). + +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +to bin according to a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.display_question() +'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' + +We see that the `gt_predicate` bins the patients into three groups: + +>>> cats = gt_predicate.get_categorizations() +>>> cats +(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) + +We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, +and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +that includes only the categories of interest: + +>>> from gpsea.analysis.predicate.genotype import filtering_predicate +>>> fgt_predicate = filtering_predicate( +... predicate=gt_predicate, +... targets=(cats[1], cats[2]), +... ) +>>> fgt_predicate.display_question() +'What is the genotype group?: HET, BIALLELIC_ALT' \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst new file mode 100644 index 00000000..8a725b6e --- /dev/null +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -0,0 +1,33 @@ +.. _genotype_predicates: + +=================== +Genotype Predicates +=================== + + +A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. +Typically, this includes using the genotype data, such as presence of a missense variant +in a heterozygous genotype. However, other categorical variables, +such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. + +The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. +The testing is done in two steps. First, we count the alleles +of the matching variants and then we interpret the count, possibly including factors +such as the expected mode of inheritance and sex, to assign the individual into a group. +Finding the matching variants is what +the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. + + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + variant_predicates + mode_of_inheritance_predicate + filtering_predicate + male_female_predicate + diagnosis_predicate + groups_predicate + + + diff --git a/docs/user-guide/predicates/groups_predicate.rst b/docs/user-guide/predicates/groups_predicate.rst new file mode 100644 index 00000000..23f178ac --- /dev/null +++ b/docs/user-guide/predicates/groups_predicate.rst @@ -0,0 +1,41 @@ +.. _groups_predicate: + +================ +Groups Predicate +================ + + + +Sometimes, all we want is to compare if there is a difference between individuals +who include one or more alleles of variant $X$ vs. individuals with variants $Y$, +vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. +We can do this with a *groups* predicate. + +The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` +takes *n* variant predicates and *n* group labels, and it will assign the patients +into the respective groups if one or more matching allele is found. +However, only one predicate is allowed to return a non-zero allele count. +Otherwise, the patient is assigned with ``None`` and excluded from the analysis. + +Example +------- + +Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate +>>> tx_id = 'NM_1234.5' +>>> gt_predicate = groups_predicate( +... predicates=( +... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), +... ), +... group_names=('Missense', 'Frameshift', 'Synonymous'), +... ) +>>> gt_predicate.display_question() +'Genotype group: Missense, Frameshift, Synonymous' + + + diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst new file mode 100644 index 00000000..fada58ec --- /dev/null +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -0,0 +1,44 @@ +.. _hpo_predicate: + + +Propagating phenotype predicate +=============================== + +When testing for presence or absence of an HPO term, the propagating phenotype predicate +leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. +In result, an individual annotated with a term is implicitly annotated with all its ancestors. +For instance, an individual annotated with `Ectopia lentis `_ +is also annotated with `Abnormal lens morphology `_, +`Abnormal anterior eye segment morphology `_, +`Abnormal eye morphology `_, ... + +Similarly, all descendants of a term, whose presence was specifically excluded in an individual, +are implicitly excluded. + +:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic. + +Example +------- + +Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` +to test for a presence of `Abnormal lens morphology `_. + + +>>> from gpsea.analysis.predicate.phenotype import HpoPredicate +>>> query = hpotk.TermId.from_curie('HP:0000517') +>>> pheno_predicate = HpoPredicate( +... hpo=hpo, +... query=query, +... ) +>>> pheno_predicate.display_question() +'Is Abnormal lens morphology present in the patient: Yes, No' + + + +missing_implies_phenotype_excluded +---------------------------------- + +In many cases, published reports of clinical data about individuals with rare diseases describes phenotypic features that were observed, but do not +provide a comprehensive list of features that were explicitly excluded. By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. +Setting this argument to True will cause "n/a" entries to be set to "excluded". We provide this option for exploration but do not recommend its use for the +final analysis unless the assumption behind it is known to be true. \ No newline at end of file diff --git a/docs/user-guide/predicates/male_female_predicate.rst b/docs/user-guide/predicates/male_female_predicate.rst new file mode 100644 index 00000000..8f16c96b --- /dev/null +++ b/docs/user-guide/predicates/male_female_predicate.rst @@ -0,0 +1,21 @@ +.. _male_female_predicate: + +Partition by the sex of the individual +====================================== + +It is easy to investigate the phenotypic differences between females and males. +The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate +for partitioning based on the sex of the individual: + +>>> from gpsea.analysis.predicate.genotype import sex_predicate +>>> gt_predicate = sex_predicate() +>>> gt_predicate.display_question() +'Sex of the individual: FEMALE, MALE' + +The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. + +Note that we have implemented this predicate as a genotype predicate, because it is used in +place of other genotype predicates. Currently, it is not possible to compare the distribution of genotypes across sexes. + + + diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst new file mode 100644 index 00000000..a00570d0 --- /dev/null +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -0,0 +1,97 @@ +.. _mode_of_inheritance_predicate: + +============================== +Mode of Inheritance Predicates +============================== + +There are five basic modes of inheritance for single-gene diseases: autosomal dominant, autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial (See +`Understanding Genetics, Appendix B `_). + + +The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` +assigns the individual into a group based on the number of alleles +that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. +The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports +the following Mendelian modes of inheritance (MoI): + + ++-----------------------+-----------------------------------+------------------+------------------------+ +| Mode of inheritance | Sex | Allele count | Genotype category | ++=======================+===================================+==================+========================+ +| Autosomal dominant | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| Autosomal recessive | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 2 | `BIALLELIC_ALT` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 3` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| X-linked dominant | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| X-linked recessive | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | ++ + +------------------+------------------------+ +| | | 2 | `BIALLELIC_ALT` | ++ + +------------------+------------------------+ +| | | :math:`\ge 3` | ``None`` | ++ +-----------------------------------+------------------+------------------------+ +| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | ++ + +------------------+------------------------+ +| | | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ + +.. note:: + + `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. + +Clinical judgment should be used to choose the MoI for the cohort analysis. +Then a predicate for the desired MoI can be created by one of +:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: + +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` + +All constructors take an instance +of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. + + +Example +------- + +Here we show seting up a predicate for grouping individuals based on +having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` +to test differences between the genotypes of a disease with an autosomal recessive MoI. + +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant meets the condition: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +for assigning a patient into a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.display_question() +'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' + +The `gt_predicate` can be used in downstream analysis, such as in :class: diff --git a/docs/user-guide/phenotype_predicates.rst b/docs/user-guide/predicates/phenotype_predicates.rst similarity index 94% rename from docs/user-guide/phenotype_predicates.rst rename to docs/user-guide/predicates/phenotype_predicates.rst index 86572657..b8991a29 100644 --- a/docs/user-guide/phenotype_predicates.rst +++ b/docs/user-guide/predicates/phenotype_predicates.rst @@ -15,4 +15,5 @@ TODO -- separate explanations for HPO (Fisher), scores (Mann Whitney) and surviv :maxdepth: 1 :caption: Contents: + hpo_predicate devries diff --git a/docs/user-guide/predicates/variant_predicates.rst b/docs/user-guide/predicates/variant_predicates.rst new file mode 100644 index 00000000..cce6130c --- /dev/null +++ b/docs/user-guide/predicates/variant_predicates.rst @@ -0,0 +1,168 @@ +.. _variant_predicates: + +================== +Variant Predicates +================== + + +GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class +to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. +The variant predicate can leverage multiple primary data: + ++------------------------+-------------------------------------------------------------------------------------------------+ +| Primary data source | Example | ++========================+=================================================================================================+ +| Allele | the variant being a deletion or a single nucleotide variant (SNV) | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Genome | overlaps of a target genomic region | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Protein data | variant is located in a region encoding a protein domain, protein feature type | ++------------------------+-------------------------------------------------------------------------------------------------+ + +which demands a considerable amount of flexibility for creating the predicate. + +As a rule of thumb, the predicates for testing basic conditions are available off the shelf, +and they can be used as building block for testing for more complex conditions, +such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". + +Let's demonstrate this on few examples. +We will load a cohort of 19 subjects with variants in *RERE*, +leading to `Holt-Oram syndrome MIM:142900 `_. +The the clinical signs and symptoms of the subjects were encoded into HPO terms +along with the pathogenic *RERE* variant. + +Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. +Briefly, we first load HPO: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +then, we configure the cohort creator: + +>>> from gpsea.preprocessing import configure_caching_cohort_creator +>>> cohort_creator = configure_caching_cohort_creator(hpo) + +which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets +from the release `0.1.18` of `Phenopacket Store `_. +We will load 19 individuals with mutations in *RERE* gene: + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('RERE')) +>>> len(phenopackets) +19 + +and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: + +>>> from gpsea.preprocessing import load_phenopackets +>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Patients Created: ... + +To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds +to a pathogenic variant `VCV000522858.5 `_ +that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with arginine: ``NM_001042681.2(RERE):c.4304A>G (p.His1435Arg)``. + +>>> variant_key_of_interest = '1_8358231_8358231_T_C' +>>> variant = cohort.get_variant_by_key(variant_key_of_interest) + +Building blocks +--------------- + +We can check that the variant overlaps with *RERE*: + +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> gene = VariantPredicates.gene('RERE') +>>> gene.test(variant) +True + +it overlaps with the *MANE* transcript: + +>>> rere_mane_tx_id = 'NM_001042681.2' +>>> tx = VariantPredicates.transcript(rere_mane_tx_id) +>>> tx.test(variant) +True + +it in fact overlaps with the exon 20: + +>>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) +>>> exon20.test(variant) +True + +and leads to a missense mutation with respect to the MANE transcript: + +>>> from gpsea.model import VariantEffect +>>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) +>>> missense.test(variant) +True + +See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` +for more info on the predicates available off the shelf. + + +Complex conditions +------------------ + +We can combine the building blocks to test for more elaborate conditions. +For instance, we can test if the variant meets *any* or several conditions: + +>>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) +>>> missense_or_nonsense = missense | nonsense +>>> missense_or_nonsense.test(variant) +True + +or *all* conditions: + +>>> missense_and_exon20 = missense & exon20 +>>> missense_and_exon20.test(variant) +True + +The `VariantPredicate` overloads Python ``&`` (AND) and ``|`` (OR) operators to build a compound predicate from lower level building blocks. + +Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, +such as testing if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: + +>>> from gpsea.model import VariantClass +>>> chromosomal_deletion = "SO:1000029" +>>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) +>>> predicate.get_question() +'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' + + +Inverting conditions +-------------------- + +Sometimes we may want to test the variant for a condition that must *not* be met. +For instance, we may want to test if the variant is a deletion +that is *not* predicted to shift the transcript reading frame. +One of doing this would be to build a compound predicates +for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_VARIANT`: + +>>> non_frameshift_effects = ( +... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, +... # and many more effects.. +... ) +>>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) + +However, this is clearly tedious and it would be much better implemented +by a simple logical not of a predicate for a frameshift variant effect. + +To support this, `VariantPredicate` implements *logical inversion* +which corresponds to Python's ``~`` operator (tilde), to wrap +the underlying predicate and to invert its test result. + +This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: + +>>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) +>>> non_frameshift_del.get_question() +'(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' + +Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. + +The variant predicate offers a flexible API for testing if variants meet a condition. +However, the genotype phenotype correlations are done on the individual level +and the variant predicates are used as a component of the genotype predicate. +The next sections show how to use variant predicates to assign individuals into groups. diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index a96700cf..81b92d32 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,4 +1,4 @@ -"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +"What is the genotype group?: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 55757ef3..4454e596 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -139,7 +139,7 @@ we expect the autosomal dominant mode of inheritance: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET' +'What is the genotype group?: HOM_REF, HET' `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group and the patients with one frameshift allele will be assigned into `HET` group. diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 3a930127..c4c82b5f 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -605,7 +605,7 @@ def __init__( ) if issues: raise ValueError("Cannot create predicate: {}".format(", ".join(issues))) - self._question = "Which genotype group does the patient fit in" + self._question = "What is the genotype group?" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations From 40f3d917c7ab568300cb80f0dccee03d921bfe51 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 6 Sep 2024 14:34:00 +0200 Subject: [PATCH 2/2] Fix some typos, use `HPOPredicate`. --- README.md | 2 +- docs/index.rst | 2 +- docs/report/tbx5_frameshift_vs_missense.csv | 12 ++-- docs/user-guide/predicates.rst | 31 ---------- .../predicates/diagnosis_predicate.rst | 2 +- .../predicates/filtering_predicate.rst | 4 +- .../predicates/genotype_predicates.rst | 2 +- .../predicates/groups_predicate.rst | 6 +- docs/user-guide/predicates/hpo_predicate.rst | 59 ++++++++++++++++--- .../predicates/male_female_predicate.rst | 2 +- .../mode_of_inheritance_predicate.rst | 6 +- docs/user-guide/report/tbx5_frameshift.csv | 10 ++-- .../report/tbx5_frameshift.mtc_report.html | 4 +- docs/user-guide/stats.rst | 2 +- .../predicate/genotype/_gt_predicates.py | 5 +- .../analysis/predicate/phenotype/__init__.py | 4 +- .../analysis/predicate/phenotype/_pheno.py | 8 ++- .../analysis/predicate/phenotype/_util.py | 4 +- tests/analysis/test_mtc_filter.py | 4 +- tests/conftest.py | 12 ++-- tests/test_predicates.py | 8 +-- 21 files changed, 101 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 449314b9..c4b00093 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![PyPi downloads](https://img.shields.io/pypi/dm/gpsea.svg?label=Pypi%20downloads) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gpsea) -GPSEA (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +GPSEA (Genotypes and Phenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) diff --git a/docs/index.rst b/docs/index.rst index b70bf9c2..5bcadd38 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ A key question in biology and human genetics concerns the relationships between genetics, the focus is generally placed on the study of whether specific disease-causing alleles are associated with specific phenotypic manifestations of the disease. -`GPSEA` (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +`GPSEA` (Genotypes and Phenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. `gpsea` ingests data from these phenopackets and performs analysis of the correlation of specific variants, variant types (e.g., missense vs. premature termination codon), or variant location in protein motifs or other features. diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 0bc1e5c6..4ab9175a 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,4 +1,4 @@ -"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, +Genotype group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 -Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 -Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 +Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 -Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, -Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, -Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, +Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 +Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,, +Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,, +Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,, diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index ed63f9d3..bfd533b7 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -25,8 +25,6 @@ use the HPO terms to assign a group. All GPSEA analyses need at least one predicate (typically a *genotype* predicate) and many require both *genotype* and *phenotype* predicates. The following pages provide more information. -.. _genotype-predicates: - .. toctree:: @@ -37,35 +35,6 @@ The following pages provide more information. predicates/genotype_predicates - - -.. _groups-predicate: - - - -.. _phenotype-predicates: - - - - -Predicates for all cohort phenotypes -==================================== - -Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. -The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` -function cuts down the tedium: - ->>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest ->>> pheno_predicates = prepare_predicates_for_terms_of_interest( -... cohort=cohort, -... hpo=hpo, -... ) ->>> len(pheno_predicates) -301 - -and prepares predicates for testing 301 HPO terms of the *RERE* cohort. - - ******* Gallery ******* diff --git a/docs/user-guide/predicates/diagnosis_predicate.rst b/docs/user-guide/predicates/diagnosis_predicate.rst index 6ce811ab..386af20a 100644 --- a/docs/user-guide/predicates/diagnosis_predicate.rst +++ b/docs/user-guide/predicates/diagnosis_predicate.rst @@ -1,4 +1,4 @@ -.. _diagnosis_predicate: +.. _diagnosis-predicate: ======================== Partition by a diagnosis diff --git a/docs/user-guide/predicates/filtering_predicate.rst b/docs/user-guide/predicates/filtering_predicate.rst index 2d47e1d3..a5acef31 100644 --- a/docs/user-guide/predicates/filtering_predicate.rst +++ b/docs/user-guide/predicates/filtering_predicate.rst @@ -35,7 +35,7 @@ to bin according to a genotype group: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) >>> gt_predicate.display_question() -'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' We see that the `gt_predicate` bins the patients into three groups: @@ -53,4 +53,4 @@ that includes only the categories of interest: ... targets=(cats[1], cats[2]), ... ) >>> fgt_predicate.display_question() -'What is the genotype group?: HET, BIALLELIC_ALT' \ No newline at end of file +'What is the genotype group: HET, BIALLELIC_ALT' \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst index 8a725b6e..70fe391d 100644 --- a/docs/user-guide/predicates/genotype_predicates.rst +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -1,4 +1,4 @@ -.. _genotype_predicates: +.. _genotype-predicates: =================== Genotype Predicates diff --git a/docs/user-guide/predicates/groups_predicate.rst b/docs/user-guide/predicates/groups_predicate.rst index 23f178ac..a6142897 100644 --- a/docs/user-guide/predicates/groups_predicate.rst +++ b/docs/user-guide/predicates/groups_predicate.rst @@ -1,4 +1,4 @@ -.. _groups_predicate: +.. _groups-predicate: ================ Groups Predicate @@ -7,8 +7,8 @@ Groups Predicate Sometimes, all we want is to compare if there is a difference between individuals -who include one or more alleles of variant $X$ vs. individuals with variants $Y$, -vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. +who include one or more alleles of variant `X` vs. individuals with variants `Y`, +vs. individuals with variants `Z`, where `X`, `Y` and `Z` are variant predicates. We can do this with a *groups* predicate. The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst index fada58ec..eaca60dc 100644 --- a/docs/user-guide/predicates/hpo_predicate.rst +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -1,10 +1,10 @@ -.. _hpo_predicate: +.. _hpo-predicate: -Propagating phenotype predicate -=============================== +HPO predicate +============= -When testing for presence or absence of an HPO term, the propagating phenotype predicate +When testing for presence or absence of an HPO term, the :class:`~gpsea.analysis.predicate.phenotype.HpoPredicate` leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. In result, an individual annotated with a term is implicitly annotated with all its ancestors. For instance, an individual annotated with `Ectopia lentis `_ @@ -15,14 +15,19 @@ is also annotated with `Abnormal lens morphology `_. +We need to load :class:`~hpotk.MinimalOntology` with HPO data to access the HPO hierarchy: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +and now we can set up a predicate to test for presence of *Abnormal lens morphology*: >>> from gpsea.analysis.predicate.phenotype import HpoPredicate >>> query = hpotk.TermId.from_curie('HP:0000517') @@ -41,4 +46,42 @@ missing_implies_phenotype_excluded In many cases, published reports of clinical data about individuals with rare diseases describes phenotypic features that were observed, but do not provide a comprehensive list of features that were explicitly excluded. By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. Setting this argument to True will cause "n/a" entries to be set to "excluded". We provide this option for exploration but do not recommend its use for the -final analysis unless the assumption behind it is known to be true. \ No newline at end of file +final analysis unless the assumption behind it is known to be true. + + + +Predicates for all cohort phenotypes +==================================== + +Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. +The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` +function cuts down the tedium. + +For a given phenopacket collection (e.g. 156 patients with mutations in *WWOX* gene included in Phenopacket Store version `0.1.18`) + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5')) +>>> len(phenopackets) +156 + +processed into a cohort + +>>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets +>>> cohort_creator = configure_caching_cohort_creator(hpo) +>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Patients Created: ... + + +we can create HPO predicates for testing all 260 HPO terms used in the cohort + +>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest +>>> pheno_predicates = prepare_predicates_for_terms_of_interest( +... cohort=cohort, +... hpo=hpo, +... ) +>>> len(pheno_predicates) +260 + +and subject the predicates into further analysis, such as :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/predicates/male_female_predicate.rst b/docs/user-guide/predicates/male_female_predicate.rst index 8f16c96b..3bed237e 100644 --- a/docs/user-guide/predicates/male_female_predicate.rst +++ b/docs/user-guide/predicates/male_female_predicate.rst @@ -1,4 +1,4 @@ -.. _male_female_predicate: +.. _male-female-predicate: Partition by the sex of the individual ====================================== diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst index a00570d0..2efb4d6b 100644 --- a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -1,4 +1,4 @@ -.. _mode_of_inheritance_predicate: +.. _mode-of-inheritance-predicate: ============================== Mode of Inheritance Predicates @@ -92,6 +92,6 @@ for assigning a patient into a genotype group: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) >>> gt_predicate.display_question() -'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' -The `gt_predicate` can be used in downstream analysis, such as in :class: +The `gt_predicate` can be used in downstream analysis, such as in :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index 81b92d32..4851c198 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,4 +1,4 @@ -"What is the genotype group?: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +What is the genotype group,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.144002047919 Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667 Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483 Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444 -Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0 Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0 Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],64/64,100%,20/20,100%,, -Abnormal cardiac septum morphology [HP:0001671],89/89,100%,28/28,100%,, -Abnormal heart morphology [HP:0001627],89/89,100%,30/30,100%,, +Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 +Aplasia/Hypoplasia of the thumb [HP:0009601],40/40,100%,19/19,100%,, +Aplasia/Hypoplasia of fingers [HP:0006265],44/44,100%,19/19,100%,, +Aplasia/hypoplasia involving bones of the hand [HP:0005927],44/44,100%,19/19,100%,, diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html index fbc4a9de..3d00300a 100644 --- a/docs/user-guide/report/tbx5_frameshift.mtc_report.html +++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html @@ -103,14 +103,14 @@

Phenotype testing report

TODO - Skipping term with only 5 observations (not powered for 2x2) + Skipping term with maximum frequency that was less than threshold 0.2 10 TODO - Skipping term with maximum frequency that was less than threshold 0.2 + Skipping term with only 5 observations (not powered for 2x2) 10 diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 4454e596..59ad7604 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -139,7 +139,7 @@ we expect the autosomal dominant mode of inheritance: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() -'What is the genotype group?: HOM_REF, HET' +'What is the genotype group: HOM_REF, HET' `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group and the patients with one frameshift allele will be assigned into `HET` group. diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index c4c82b5f..76eebcc5 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,4 +1,3 @@ -from cProfile import label import dataclasses import enum import typing @@ -605,7 +604,7 @@ def __init__( ) if issues: raise ValueError("Cannot create predicate: {}".format(", ".join(issues))) - self._question = "What is the genotype group?" + self._question = "What is the genotype group" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -727,7 +726,7 @@ def sex_predicate() -> GenotypePolyPredicate: """ Get a genotype predicate for categorizing patients by their :class:`~gpsea.model.Sex`. - See the :ref:`sex-predicate` section for an example. + See the :ref:`male-female-predicate` section for an example. """ return INSTANCE diff --git a/src/gpsea/analysis/predicate/phenotype/__init__.py b/src/gpsea/analysis/predicate/phenotype/__init__.py index 2efa1e55..a70f954b 100644 --- a/src/gpsea/analysis/predicate/phenotype/__init__.py +++ b/src/gpsea/analysis/predicate/phenotype/__init__.py @@ -6,13 +6,13 @@ or using the phenotype features encoded into HPO terms (:class:`PropagatingPhenotypePredicate`). """ -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from ._pheno import PhenotypePolyPredicate, HpoPredicate from ._pheno import DiseasePresencePredicate from ._pheno import PhenotypeCategorization, P from ._util import prepare_predicates_for_terms_of_interest, prepare_hpo_terms_of_interest __all__ = [ - 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate', + 'PhenotypePolyPredicate', 'HpoPredicate', 'DiseasePresencePredicate', 'PhenotypeCategorization', 'P', 'prepare_predicates_for_terms_of_interest', 'prepare_hpo_terms_of_interest', diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 3b2d9a7b..4cb8a804 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -92,13 +92,15 @@ def present_phenotype_category(self) -> PatientCategory: return self.present_phenotype_categorization.category -class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]): +class HpoPredicate(PhenotypePolyPredicate[hpotk.TermId]): """ - `PropagatingPhenotypePredicate` tests if a patient is annotated with an HPO term. + `HpoPredicate` tests if a patient is annotated with an HPO term. Note, `query` must be a term of the provided `hpo`! - :param hpo: HPO object + See :ref:`hpo-predicate` section for an example usage. + + :param hpo: HPO ontology :param query: the HPO term to test :param missing_implies_phenotype_excluded: `True` if lack of an explicit annotation implies term's absence`. """ diff --git a/src/gpsea/analysis/predicate/phenotype/_util.py b/src/gpsea/analysis/predicate/phenotype/_util.py index ac73dd02..1a0e33d5 100644 --- a/src/gpsea/analysis/predicate/phenotype/_util.py +++ b/src/gpsea/analysis/predicate/phenotype/_util.py @@ -4,7 +4,7 @@ import hpotk -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from ._pheno import PhenotypePolyPredicate, HpoPredicate from gpsea.model import Patient @@ -26,7 +26,7 @@ def prepare_predicates_for_terms_of_interest( (either directly or indirectly) for the term to be included in the analysis. """ return tuple( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=term, missing_implies_phenotype_excluded=missing_implies_excluded, diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index d904cfe4..02710efd 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -8,7 +8,7 @@ from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.analysis.pcats import apply_predicates_on_patients from gpsea.model import Cohort @@ -55,7 +55,7 @@ def ph_predicate( For the purpose of testing counts, let's pretend the counts were created by this predicate. """ - return PropagatingPhenotypePredicate( + return HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie("HP:0001250"), # Seizure missing_implies_phenotype_excluded=False, diff --git a/tests/conftest.py b/tests/conftest.py index b66b55e7..26683724 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder from gpsea.model import * from gpsea.model.genome import GRCh38, GenomicRegion, Region, Strand, GenomeBuild @@ -128,23 +128,23 @@ def suox_pheno_predicates( Note, these are just a *SUBSET* of all phenotypes that can be tested for in the *SUOX* cohort. """ return ( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001250'), # Seizure ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001083'), # Ectopia lentis ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0032350'), # Sulfocysteinuria ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0012758'), # Neurodevelopmental delay ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001276'), # Hypertonia ), diff --git a/tests/test_predicates.py b/tests/test_predicates.py index c87ce0ad..467a5df1 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -2,7 +2,7 @@ import pytest from gpsea.analysis.predicate import PatientCategory, PatientCategories -from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate, DiseasePresencePredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate, DiseasePresencePredicate from gpsea.analysis.predicate.genotype import * from gpsea.model import Cohort, Patient, FeatureType, VariantEffect from gpsea.model.genome import Region @@ -16,7 +16,7 @@ def find_patient(pat_id: str, cohort: Cohort) -> Patient: raise ValueError(f'Could not find patient {pat_id}') -class TestPropagatingPhenotypeBooleanPredicate: +class TestHpoPredicate: @pytest.mark.parametrize('curie, patient_id, expected', # Patient "HetSingleVar" has Phenotypes: @@ -46,7 +46,7 @@ def test_phenotype_predicate__present_or_excluded( ): patient = find_patient(patient_id, toy_cohort) term_id = hpotk.TermId.from_curie(curie) - predicate = PropagatingPhenotypePredicate(hpo=hpo, query=term_id) + predicate = HpoPredicate(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual.phenotype == term_id @@ -60,7 +60,7 @@ def test_phenotype_predicate__unknown( # Not Measured and not Observed - 'HP:0006280', # Chronic pancreatitis patient = find_patient('HetSingleVar', toy_cohort) term_id = hpotk.TermId.from_curie('HP:0006280') - predicate = PropagatingPhenotypePredicate(hpo=hpo, query=term_id) + predicate = HpoPredicate(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual is None